Spaces:

bhsinghgrid
/

DevaFlow-space

Sleeping

App Files Files Community

bhsinghgrid commited on 9 days ago

Commit

29e5bf8

verified ·

1 Parent(s): 35a343d

Upgrade UI: model selection + tasks 1-5 + analysis modules

Browse files

Files changed (41) hide show

.gitattributes +1 -0
__pycache__/app.cpython-311.pyc +0 -0
analysis/__pycache__/run_analysis.cpython-311.pyc +0 -0
analysis/attention_viz.py +621 -0
analysis/concept_vectors.py +637 -0
analysis/kv_cache_benchmark.py +451 -0
analysis/outputs/task1_kv_cache.txt +23 -0
analysis/outputs/task2_all_layers_t0.png +0 -0
analysis/outputs/task2_attn_evolution.png +0 -0
analysis/outputs/task2_attn_t0.png +0 -0
analysis/outputs/task2_attn_t127.png +0 -0
analysis/outputs/task2_examples/example_1_attn_t0.png +0 -0
analysis/outputs/task2_examples/example_2_attn_t0.png +0 -0
analysis/outputs/task2_examples/example_3_attn_t0.png +0 -0
analysis/outputs/task2_examples/example_4_attn_t0.png +0 -0
analysis/outputs/task2_examples/example_5_attn_t0.png +0 -0
analysis/outputs/task2_report.txt +100 -0
analysis/outputs/task2_semantic_drift.png +0 -0
analysis/outputs/task2_source_alignment.png +0 -0
analysis/outputs/task3_concept_space.png +3 -0
analysis/outputs/task3_diversity_direction.npy +3 -0
analysis/outputs/task3_report.txt +12 -0
analysis/outputs/task5_quality_classifier.pt +3 -0
analysis/outputs/task5_quality_data.npz +3 -0
analysis/outputs_multi/results__d3pm_cross_attention_neg_False/task1/task1_kv_cache.txt +10 -0
analysis/outputs_multi/results__d3pm_cross_attention_neg_True/task1/task1_kv_cache.txt +10 -0
analysis/quality_classifier.py +723 -0
analysis/reports/README.md +19 -0
analysis/reports/task1_kv_cache_report.md +99 -0
analysis/reports/task2_attention_drift_report.md +112 -0
analysis/reports/task3_concept_vectors_report.md +96 -0
analysis/reports/task4_step_ablation_report.md +89 -0
analysis/reports/task5_quality_guidance_report.md +101 -0
analysis/run_analysis.py +466 -0
analysis/run_tasks_except4_all_models.py +123 -0
analysis/semantic_drift.py +569 -0
analysis/step_ablation.py +582 -0
app.py +487 -175
data/__init__.py +0 -0
data/dataset.py +152 -0
requirements.txt +6 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+analysis/outputs/task3_concept_space.png filter=lfs diff=lfs merge=lfs -text

__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (29.2 kB). View file

analysis/__pycache__/run_analysis.cpython-311.pyc ADDED Viewed

Binary file (32 kB). View file

analysis/attention_viz.py ADDED Viewed

	@@ -0,0 +1,621 @@

+# """
+# analysis/attention_viz.py
+# ==========================
+# Task 2: Attention weight capture and visualization across diffusion steps.
+#
+# How it works (no retraining needed):
+#   MultiHeadAttention now has two attributes:
+#     - capture_weights: bool  — set True to start storing weights
+#     - last_attn_weights: Tensor — [B, n_heads, Lq, Lk], updated each forward call
+#
+#   AttentionCapture:
+#     - Sets capture_weights=True on all cross-attention layers
+#     - Hooks into generate_cached() to record weights at every diffusion step
+#     - Returns a dict: {t_val: [layer_0_weights, layer_1_weights, ...]}
+#
+#   Visualization:
+#     - plot_attn_heatmap(): shows src→tgt alignment at a single step
+#     - plot_attn_evolution(): shows how one src→tgt pair evolves over T steps
+#     - plot_all_layers(): grid of heatmaps per layer at a given step
+#
+# Usage:
+#     from analysis.attention_viz import AttentionCapture, plot_attn_heatmap
+#
+#     capturer = AttentionCapture(model)
+#     weights  = capturer.capture(src_ids, src_tokens, tgt_tokens)
+#     plot_attn_heatmap(weights, step=0, layer=0, src_tokens=..., tgt_tokens=...)
+# """
+#
+# import torch
+# import numpy as np
+# import os
+# from typing import List, Dict, Optional
+#
+#
+# # ── Attention capture ─────────────────────────────────────────────────
+#
+# class AttentionCapture:
+#     """
+#     Captures cross-attention weights from all decoder layers at every
+#     diffusion step during generate_cached().
+#
+#     Works by:
+#       1. Setting capture_weights=True on each DecoderBlock.cross_attn
+#       2. Running generate_cached() (encoder runs once via KV cache)
+#       3. After each denoising step, reading last_attn_weights from each layer
+#       4. Storing as {t_val: list_of_layer_weights}
+#
+#     Zero retraining required — uses the flag added to MultiHeadAttention.
+#     """
+#
+#     def __init__(self, model):
+#         """
+#         Args:
+#             model : SanskritModel wrapper (must be D3PMCrossAttention)
+#         """
+#         self.model       = model
+#         self.inner       = model.model   # D3PMCrossAttention
+#         self._cross_attns = []
+#
+#         # Collect all cross-attention modules from decoder blocks
+#         if hasattr(self.inner, 'decoder_blocks'):
+#             for block in self.inner.decoder_blocks:
+#                 if hasattr(block, 'cross_attn'):
+#                     self._cross_attns.append(block.cross_attn)
+#
+#         if not self._cross_attns:
+#             raise ValueError(
+#                 "No cross-attention layers found. "
+#                 "AttentionCapture only works with D3PMCrossAttention."
+#             )
+#
+#         print(f"AttentionCapture: found {len(self._cross_attns)} cross-attention layers.")
+#
+#     def _enable(self):
+#         """Turn on weight capture for all cross-attention layers."""
+#         for ca in self._cross_attns:
+#             ca.capture_weights = True
+#
+#     def _disable(self):
+#         """Turn off weight capture (restores zero overhead)."""
+#         for ca in self._cross_attns:
+#             ca.capture_weights = False
+#             ca.last_attn_weights = None
+#
+#     def _read_weights(self) -> List[np.ndarray]:
+#         """
+#         Read current last_attn_weights from all layers.
+#         Returns list of [B, n_heads, Lq, Lk] arrays — one per layer.
+#         Averages over heads to produce [B, Lq, Lk].
+#         """
+#         weights = []
+#         for ca in self._cross_attns:
+#             if ca.last_attn_weights is not None:
+#                 # Average over attention heads → [B, Lq, Lk]
+#                 w = ca.last_attn_weights.float().mean(dim=1)
+#                 weights.append(w.numpy())
+#         return weights
+#
+#     @torch.no_grad()
+#     def capture(
+#         self,
+#         src:        torch.Tensor,
+#         capture_every: int = 10,
+#     ) -> Dict[int, List[np.ndarray]]:
+#         """
+#         Run full generation while capturing attention at every `capture_every` steps.
+#
+#         Args:
+#             src           : [1, src_len] or [B, src_len] IAST token ids
+#             capture_every : capture weights every N steps (default 10)
+#                             Use 1 to capture every step (slow, high memory).
+#
+#         Returns:
+#             step_weights : dict mapping t_val → list of [B, Lq, Lk] arrays
+#                            one array per decoder layer
+#                            keys are t values: T-1, T-1-N, ..., 0
+#
+#         Example:
+#             weights = capturer.capture(src_ids, capture_every=10)
+#             # weights[127] = layer weights at t=127 (heavy noise)
+#             # weights[0]   = layer weights at t=0   (clean output)
+#         """
+#         if src.dim() == 1:
+#             src = src.unsqueeze(0)
+#
+#         inner  = self.inner
+#         T      = inner.scheduler.num_timesteps
+#         device = src.device
+#
+#         # KV cache: encode source once
+#         memory, src_pad_mask = inner.encode_source(src)
+#
+#         B       = src.shape[0]
+#         tgt_len = inner.max_seq_len
+#         mask_id = inner.mask_token_id
+#
+#         x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+#         hint   = None
+#
+#         step_weights: Dict[int, List[np.ndarray]] = {}
+#
+#         self._enable()
+#         try:
+#             inner.eval()
+#             for t_val in range(T - 1, -1, -1):
+#                 t       = torch.full((B,), t_val, dtype=torch.long, device=device)
+#                 is_last = (t_val == 0)
+#
+#                 logits, _ = inner.forward_cached(
+#                     memory, src_pad_mask, x0_est, t,
+#                     x0_hint=hint, inference_mode=True,
+#                 )
+#
+#                 # Capture at this step if scheduled or it's the last step
+#                 if (T - 1 - t_val) % capture_every == 0 or is_last:
+#                     step_weights[t_val] = self._read_weights()
+#
+#                 import torch.nn.functional as F
+#                 probs  = F.softmax(logits / 0.8, dim=-1)
+#                 x0_est = torch.argmax(probs, dim=-1) if is_last else \
+#                          _multinomial_sample(probs)
+#                 hint   = x0_est
+#
+#         finally:
+#             self._disable()   # always restore — even if exception raised
+#
+#         print(f"Captured attention at {len(step_weights)} steps "
+#               f"({len(self._cross_attns)} layers each).")
+#         return step_weights
+#
+#
+# def _multinomial_sample(probs: torch.Tensor) -> torch.Tensor:
+#     B, L, V = probs.shape
+#     flat    = probs.view(B * L, V).clamp(min=1e-9)
+#     flat    = flat / flat.sum(dim=-1, keepdim=True)
+#     return torch.multinomial(flat, 1).squeeze(-1).view(B, L)
+#
+#
+# # ── Visualization ─────────────────────────────────────────────────────
+#
+# def plot_attn_heatmap(
+#     step_weights:  Dict[int, List[np.ndarray]],
+#     t_val:         int,
+#     layer:         int,
+#     src_tokens:    List[str],
+#     tgt_tokens:    List[str],
+#     sample_idx:    int  = 0,
+#     save_path:     Optional[str] = None,
+#     title:         Optional[str] = None,
+# ):
+#     """
+#     Plot cross-attention heatmap for a single step and layer.
+#
+#     X-axis = source (IAST) tokens
+#     Y-axis = target (Devanagari) positions
+#     Color  = attention weight (brighter = stronger attention)
+#
+#     Args:
+#         step_weights : output of AttentionCapture.capture()
+#         t_val        : which diffusion step to visualize
+#         layer        : which decoder layer (0 = first, -1 = last)
+#         src_tokens   : list of IAST token strings for x-axis labels
+#         tgt_tokens   : list of Devanagari token strings for y-axis labels
+#         sample_idx   : which batch item to visualize (default 0)
+#         save_path    : if given, save figure to this path
+#         title        : custom plot title
+#     """
+#     try:
+#         import matplotlib.pyplot as plt
+#         import matplotlib.ticker as ticker
+#     except ImportError:
+#         print("pip install matplotlib to use visualization functions.")
+#         return
+#
+#     if t_val not in step_weights:
+#         available = sorted(step_weights.keys())
+#         raise ValueError(
+#             f"t_val={t_val} not in captured steps. "
+#             f"Available: {available[:5]}...{available[-5:]}"
+#         )
+#
+#     layers  = step_weights[t_val]
+#     weights = layers[layer][sample_idx]   # [Lq, Lk]
+#
+#     # Trim to actual token lengths
+#     n_src = min(len(src_tokens), weights.shape[1])
+#     n_tgt = min(len(tgt_tokens), weights.shape[0])
+#     weights = weights[:n_tgt, :n_src]
+#
+#     fig, ax = plt.subplots(figsize=(max(8, n_src * 0.4), max(6, n_tgt * 0.35)))
+#     im = ax.imshow(weights, aspect='auto', cmap='YlOrRd', interpolation='nearest')
+#
+#     ax.set_xticks(range(n_src))
+#     ax.set_xticklabels(src_tokens[:n_src], rotation=45, ha='right', fontsize=9)
+#     ax.set_yticks(range(n_tgt))
+#     ax.set_yticklabels(tgt_tokens[:n_tgt], fontsize=9)
+#
+#     ax.set_xlabel("Source (IAST)", fontsize=11)
+#     ax.set_ylabel("Target position (Devanagari)", fontsize=11)
+#
+#     plot_title = title or f"Cross-Attention  |  t={t_val}  |  Layer {layer}"
+#     ax.set_title(plot_title, fontsize=12, pad=10)
+#
+#     plt.colorbar(im, ax=ax, label="Attention weight")
+#     plt.tight_layout()
+#
+#     if save_path:
+#         os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+#         plt.savefig(save_path, dpi=150, bbox_inches='tight')
+#         print(f"Saved: {save_path}")
+#     else:
+#         plt.show()
+#     plt.close()
+#
+#
+# def plot_attn_evolution(
+#     step_weights:  Dict[int, List[np.ndarray]],
+#     src_token_idx: int,
+#     tgt_token_idx: int,
+#     layer:         int = -1,
+#     sample_idx:    int = 0,
+#     src_token_str: str = "",
+#     tgt_token_str: str = "",
+#     save_path:     Optional[str] = None,
+# ):
+#     """
+#     Plot how attention between one specific src↔tgt token pair evolves
+#     across all captured diffusion steps (T → 0).
+#
+#     Reveals whether a token pair is 'locked' (stable from early steps)
+#     or 'flexible' (weight fluctuates until final steps).
+#
+#     Args:
+#         step_weights  : output of AttentionCapture.capture()
+#         src_token_idx : index of source token to track
+#         tgt_token_idx : index of target position to track
+#         layer         : decoder layer index
+#         sample_idx    : batch item
+#         src_token_str : string label for the source token (for plot title)
+#         tgt_token_str : string label for the target token (for plot title)
+#         save_path     : if given, save figure to this path
+#     """
+#     try:
+#         import matplotlib.pyplot as plt
+#     except ImportError:
+#         print("pip install matplotlib to use visualization functions.")
+#         return
+#
+#     t_vals  = sorted(step_weights.keys(), reverse=True)  # T-1 → 0
+#     weights = []
+#
+#     for t_val in t_vals:
+#         layers = step_weights[t_val]
+#         w      = layers[layer][sample_idx]   # [Lq, Lk]
+#         if tgt_token_idx < w.shape[0] and src_token_idx < w.shape[1]:
+#             weights.append(w[tgt_token_idx, src_token_idx])
+#         else:
+#             weights.append(0.0)
+#
+#     fig, ax = plt.subplots(figsize=(12, 4))
+#     ax.plot(range(len(t_vals)), weights, linewidth=1.5, color='steelblue')
+#     ax.fill_between(range(len(t_vals)), weights, alpha=0.2, color='steelblue')
+#
+#     # Mark every 10th step on x-axis
+#     step_labels = [str(t) if i % max(1, len(t_vals)//10) == 0 else ""
+#                    for i, t in enumerate(t_vals)]
+#     ax.set_xticks(range(len(t_vals)))
+#     ax.set_xticklabels(step_labels, fontsize=8)
+#     ax.set_xlabel("Diffusion step (T → 0)", fontsize=11)
+#     ax.set_ylabel("Attention weight", fontsize=11)
+#
+#     pair_str = f"src[{src_token_idx}]={src_token_str!r} → tgt[{tgt_token_idx}]={tgt_token_str!r}"
+#     ax.set_title(f"Attention evolution  |  {pair_str}  |  Layer {layer}", fontsize=11)
+#     ax.set_xlim(0, len(t_vals) - 1)
+#     ax.set_ylim(0, None)
+#     plt.tight_layout()
+#
+#     if save_path:
+#         os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+#         plt.savefig(save_path, dpi=150, bbox_inches='tight')
+#         print(f"Saved: {save_path}")
+#     else:
+#         plt.show()
+#     plt.close()
+#
+#
+# def plot_all_layers(
+#     step_weights: Dict[int, List[np.ndarray]],
+#     t_val:        int,
+#     src_tokens:   List[str],
+#     tgt_tokens:   List[str],
+#     sample_idx:   int          = 0,
+#     save_path:    Optional[str] = None,
+# ):
+#     """
+#     Plot attention heatmaps for ALL decoder layers at a single diffusion step.
+#     Shows how different layers specialize their attention patterns.
+#     """
+#     try:
+#         import matplotlib.pyplot as plt
+#     except ImportError:
+#         print("pip install matplotlib to use visualization functions.")
+#         return
+#
+#     layers  = step_weights[t_val]
+#     n_layers = len(layers)
+#     n_cols   = min(4, n_layers)
+#     n_rows   = (n_layers + n_cols - 1) // n_cols
+#
+#     fig, axes = plt.subplots(n_rows, n_cols,
+#                               figsize=(n_cols * 5, n_rows * 4))
+#     axes = np.array(axes).flatten() if n_layers > 1 else [axes]
+#
+#     n_src = min(len(src_tokens), layers[0][sample_idx].shape[1])
+#     n_tgt = min(len(tgt_tokens), layers[0][sample_idx].shape[0])
+#
+#     for i, (ax, layer_w) in enumerate(zip(axes, layers)):
+#         w  = layer_w[sample_idx][:n_tgt, :n_src]
+#         im = ax.imshow(w, aspect='auto', cmap='YlOrRd', interpolation='nearest',
+#                        vmin=0, vmax=w.max())
+#         ax.set_title(f"Layer {i}", fontsize=10)
+#         ax.set_xticks(range(n_src))
+#         ax.set_xticklabels(src_tokens[:n_src], rotation=45, ha='right', fontsize=7)
+#         ax.set_yticks(range(n_tgt))
+#         ax.set_yticklabels(tgt_tokens[:n_tgt], fontsize=7)
+#
+#     for ax in axes[n_layers:]:
+#         ax.set_visible(False)
+#
+#     fig.suptitle(f"All layers at t={t_val}", fontsize=13, y=1.02)
+#     plt.tight_layout()
+#
+#     if save_path:
+#         os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+#         plt.savefig(save_path, dpi=150, bbox_inches='tight')
+#         print(f"Saved: {save_path}")
+#     else:
+#         plt.show()
+#     plt.close()
+"""
+analysis/task2_full.py
+=====================
+FULL Task 2 implementation:
+✔ Attention trajectory (already yours)
+✔ BERTScore over diffusion steps
+✔ Semantic drift metric
+✔ Locked vs flexible token detection
+✔ TF-IDF vs attention stability correlation
+"""
+import torch
+import numpy as np
+from typing import Dict, List
+from collections import defaultdict
+# Optional metrics
+from sklearn.feature_extraction.text import TfidfVectorizer
+try:
+    import evaluate
+    bertscore = evaluate.load("bertscore")
+    USE_BERT = True
+except:
+    USE_BERT = False
+# ─────────────────────────────────────────────────────────────
+# 1. ATTENTION CAPTURE (FIXED VERSION)
+# ─────────────────────────────────────────────────────────────
+class AttentionCapture:
+    def __init__(self, model):
+        self.model = model
+        self.inner = model.model
+        self.cross_attns = []
+        for block in self.inner.decoder_blocks:
+            if hasattr(block, "cross_attn"):
+                self.cross_attns.append(block.cross_attn)
+    def _enable(self):
+        for ca in self.cross_attns:
+            ca.capture_weights = True
+    def _disable(self):
+        for ca in self.cross_attns:
+            ca.capture_weights = False
+            ca.last_attn_weights = None
+    def _read(self):
+        weights = []
+        for ca in self.cross_attns:
+            if ca.last_attn_weights is not None:
+                w = ca.last_attn_weights.mean(dim=1)  # avg heads
+                weights.append(w.cpu().numpy())
+        return weights
+    @torch.no_grad()
+    def run(self, src_ids):
+        inner = self.inner
+        T = inner.scheduler.num_timesteps
+        device = src_ids.device
+        memory, mask = inner.encode_source(src_ids)
+        x = torch.full(
+            (1, inner.max_seq_len),
+            inner.mask_token_id,
+            dtype=torch.long,
+            device=device
+        )
+        hint = None
+        step_weights = {}
+        step_outputs = {}
+        self._enable()
+        try:
+            for t_val in range(T - 1, -1, -1):
+                t = torch.tensor([t_val], device=device)
+                logits, _ = inner.forward_cached(
+                    memory, mask, x, t, x0_hint=hint, inference_mode=True
+                )
+                probs = torch.softmax(logits, dim=-1)
+                x = torch.argmax(probs, dim=-1)
+                step_weights[t_val] = self._read()
+                step_outputs[t_val] = x.clone()
+                hint = x
+        finally:
+            self._disable()
+        return step_weights, step_outputs
+# ─────────────────────────────────────────────────────────────
+# 2. BERTScore + Semantic Drift
+# ─────────────────────────────────────────────────────────────
+def compute_trajectory_metrics(
+    step_outputs,
+    tgt_tokenizer,
+    reference_text
+):
+    trajectory = []
+    for t, ids in step_outputs.items():
+        text = tgt_tokenizer.decode(
+            [x for x in ids[0].tolist() if x > 4]
+        )
+        if USE_BERT:
+            score = bertscore.compute(
+                predictions=[text],
+                references=[reference_text],
+                lang="hi"
+            )["f1"][0]
+        else:
+            score = 0.0
+        drift = 1.0 - score
+        trajectory.append({
+            "step": t,
+            "text": text,
+            "bert": score,
+            "drift": drift
+        })
+    return sorted(trajectory, key=lambda x: -x["step"])
+# ─────────────────────────────────────────────────────────────
+# 3. LOCKED vs FLEXIBLE TOKENS
+# ─────────────────────────────────────────────────────────────
+def analyze_token_stability(step_weights):
+    """
+    Measure variance of attention over time
+    """
+    token_stability = defaultdict(list)
+    for t, layers in step_weights.items():
+        last_layer = layers[-1][0]  # [Lq, Lk]
+        # max attention source index per target token
+        align = np.argmax(last_layer, axis=1)
+        for tgt_idx, src_idx in enumerate(align):
+            token_stability[tgt_idx].append(src_idx)
+    results = {}
+    for tgt_idx, src_seq in token_stability.items():
+        changes = sum(
+            1 for i in range(1, len(src_seq))
+            if src_seq[i] != src_seq[i-1]
+        )
+        if changes <= 2:
+            results[tgt_idx] = "LOCKED"
+        else:
+            results[tgt_idx] = "FLEXIBLE"
+    return results
+# ─────────────────────────────────────────────────────────────
+# 4. TF-IDF vs ATTENTION STABILITY
+# ─────────────────────────���───────────────────────────────────
+def tfidf_attention_correlation(src_text, step_weights):
+    vectorizer = TfidfVectorizer()
+    tfidf = vectorizer.fit_transform([src_text]).toarray()[0]
+    # Avg attention over steps
+    attn_scores = None
+    for t, layers in step_weights.items():
+        w = layers[-1][0]  # last layer
+        avg = w.mean(axis=0)  # per source token
+        if attn_scores is None:
+            attn_scores = avg
+        else:
+            attn_scores += avg
+    attn_scores /= len(step_weights)
+    # Correlation
+    min_len = min(len(tfidf), len(attn_scores))
+    corr = np.corrcoef(tfidf[:min_len], attn_scores[:min_len])[0, 1]
+    return corr
+# ─────────────────────────────────────────────────────────────
+# 5. FULL PIPELINE
+# ─────────────────────────────────────────────────────────────
+def run_task2_analysis(
+    text,
+    model,
+    src_tokenizer,
+    tgt_tokenizer,
+    device
+):
+    src_ids = torch.tensor(
+        [src_tokenizer.encode(text)],
+        device=device
+    )
+    capturer = AttentionCapture(model)
+    # Step 1: Capture
+    step_weights, step_outputs = capturer.run(src_ids)
+    # Step 2: Metrics
+    trajectory = compute_trajectory_metrics(
+        step_outputs,
+        tgt_tokenizer,
+        reference_text=text   # transliteration task
+    )
+    # Step 3: Token stability
+    stability = analyze_token_stability(step_weights)
+    # Step 4: TF-IDF correlation
+    corr = tfidf_attention_correlation(text, step_weights)
+    return {
+        "trajectory": trajectory,
+        "token_stability": stability,
+        "tfidf_corr": corr
+    }

analysis/concept_vectors.py ADDED Viewed

	@@ -0,0 +1,637 @@

+# """
+# analysis/concept_vectors.py
+# ============================
+# Task 3: Concept Vector Extraction + Controlled Paraphrase Diversity
+#
+# No retraining required. Uses decoder hidden states already computed
+# during generate_cached() — stored in model.model._last_hidden after
+# each forward_cached() call.
+#
+# Steps:
+#   1. Collect hidden states from N examples at a fixed diffusion step
+#   2. Pool sequence dimension → [N, d_model] representation per example
+#   3. PCA → find principal directions in concept space
+#   4. Identify "diversity direction" (PC that best separates short/long outputs)
+#   5. Steer: at inference, shift hidden states along diversity direction
+#      before the output head projection
+#   6. Generate at 5 points along the direction, measure output diversity
+#
+# Key insight: the diversity direction is found purely from model outputs
+# (no human annotation needed). We use output length as a proxy:
+#   short output  → low diversity (model collapsed to simple token)
+#   long output   → high diversity (model exploring more of the space)
+# """
+#
+# import torch
+# import torch.nn as nn
+# import torch.nn.functional as F
+# import numpy as np
+# from typing import List, Dict, Optional, Tuple
+#
+#
+# # ── Hidden state collection ───────────────────────────────────────────
+#
+# @torch.no_grad()
+# def collect_hidden_states(
+#     model,
+#     src_list:    List[torch.Tensor],
+#     t_capture:   int   = 0,
+#     temperature: float = 0.8,
+#     top_k:       int   = 40,
+#     max_samples: int   = 1000,
+# ) -> Tuple[np.ndarray, List[str]]:
+#     """
+#     Run generate_cached() on a list of source tensors, collecting the
+#     decoder hidden state at timestep t_capture for each sample.
+#
+#     Args:
+#         model      : SanskritModel (D3PMCrossAttention)
+#         src_list   : list of [1, src_len] tensors, one per sample
+#         t_capture  : which diffusion step to capture hidden states at
+#                      0 = final (clean), T-1 = noisy start
+#         temperature: sampling temperature
+#         top_k      : top-k filter
+#         max_samples: cap at this many samples
+#
+#     Returns:
+#         hidden_matrix : np.ndarray [N, d_model] — pooled hidden states
+#         output_texts  : list of N decoded output strings (for diversity analysis)
+#     """
+#     inner   = model.model
+#     T       = inner.scheduler.num_timesteps
+#     device  = next(inner.parameters()).device
+#
+#     hidden_list  = []
+#     output_list  = []
+#
+#     n = min(len(src_list), max_samples)
+#     print(f"Collecting hidden states from {n} examples at t={t_capture}...")
+#
+#     for i, src in enumerate(src_list[:n]):
+#         if i % 100 == 0:
+#             print(f"  {i}/{n}")
+#
+#         if src.dim() == 1:
+#             src = src.unsqueeze(0)
+#         src = src.to(device)
+#
+#         B       = src.shape[0]
+#         tgt_len = inner.max_seq_len
+#         mask_id = inner.mask_token_id
+#
+#         # KV cache
+#         memory, src_pad_mask = inner.encode_source(src)
+#
+#         x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+#         hint   = None
+#         captured_hidden = None
+#
+#         for t_val in range(T - 1, -1, -1):
+#             t       = torch.full((B,), t_val, dtype=torch.long, device=device)
+#             is_last = (t_val == 0)
+#
+#             logits, _ = inner.forward_cached(
+#                 memory, src_pad_mask, x0_est, t,
+#                 x0_hint=hint, inference_mode=True,
+#             )
+#
+#             # Capture hidden state at target step
+#             if t_val == t_capture and hasattr(inner, '_last_hidden'):
+#                 captured_hidden = inner._last_hidden.detach().cpu()
+#
+#             logits = logits / max(temperature, 1e-8)
+#             if top_k > 0:
+#                 V = logits.shape[-1]
+#                 if top_k < V:
+#                     vals, _ = torch.topk(logits, top_k, dim=-1)
+#                     logits  = logits.masked_fill(logits < vals[..., -1:], float('-inf'))
+#
+#             probs  = F.softmax(logits, dim=-1)
+#             x0_est = torch.argmax(probs, dim=-1) if is_last else _sample(probs)
+#             hint   = x0_est
+#
+#         # Pool hidden state over non-PAD positions → [d_model]
+#         if captured_hidden is not None:
+#             non_pad = (x0_est[0] > 1).cpu()   # [tgt_len] bool
+#             if non_pad.sum() > 0:
+#                 h = captured_hidden[0][non_pad].mean(dim=0)   # [d_model]
+#             else:
+#                 h = captured_hidden[0].mean(dim=0)
+#             hidden_list.append(h.numpy())
+#
+#         # Decode output
+#         ids  = [x for x in x0_est[0].tolist() if x > 4]
+#
+#     print(f"Collected {len(hidden_list)} hidden states.")
+#     return np.stack(hidden_list), output_list
+#
+#
+# # ── PCA on hidden states ─────────────────────────────���────────────────
+#
+# def fit_pca(
+#     hidden_matrix: np.ndarray,
+#     n_components:  int = 50,
+# ) -> object:
+#     """
+#     Fit PCA on hidden state matrix.
+#
+#     Args:
+#         hidden_matrix : [N, d_model]
+#         n_components  : number of PCA components to retain
+#
+#     Returns:
+#         fitted sklearn PCA object
+#     """
+#     from sklearn.decomposition import PCA
+#     n_comp = min(n_components, hidden_matrix.shape[0] - 1, hidden_matrix.shape[1])
+#     pca    = PCA(n_components=n_comp)
+#     pca.fit(hidden_matrix)
+#     print(f"PCA fit: {n_comp} components explain "
+#           f"{pca.explained_variance_ratio_.sum()*100:.1f}% of variance.")
+#     return pca
+#
+#
+# def find_diversity_direction(
+#     hidden_matrix: np.ndarray,
+#     output_lengths: List[int],
+#     pca:           object,
+# ) -> np.ndarray:
+#     """
+#     Find the PCA direction that best correlates with output diversity
+#     (measured by output length as proxy).
+#
+#     Projects hidden states into PCA space, then finds the PC whose
+#     scores have highest Spearman correlation with output lengths.
+#
+#     Returns:
+#         direction : np.ndarray [d_model] — diversity direction in original space
+#     """
+#     from scipy.stats import spearmanr
+#
+#     projected = pca.transform(hidden_matrix)   # [N, n_components]
+#     lengths   = np.array(output_lengths)
+#
+#     correlations = []
+#     for pc_idx in range(projected.shape[1]):
+#         r, _ = spearmanr(projected[:, pc_idx], lengths)
+#         correlations.append(abs(r))
+#
+#     best_pc = int(np.argmax(correlations))
+#     print(f"Diversity direction: PC {best_pc}  "
+#           f"(|r|={correlations[best_pc]:.3f} with output length)")
+#
+#     # Map back to original d_model space
+#     direction = pca.components_[best_pc]   # [d_model]
+#     direction = direction / (np.linalg.norm(direction) + 1e-8)
+#     return direction, best_pc, correlations[best_pc]
+#
+#
+# # ── Steered generation ────────────────────────────────────────────────
+#
+# @torch.no_grad()
+# def generate_steered(
+#     model,
+#     src:       torch.Tensor,
+#     direction: np.ndarray,
+#     alpha:     float = 0.0,
+#     temperature: float = 0.8,
+#     top_k:     int   = 40,
+# ) -> torch.Tensor:
+#     """
+#     Generate output while steering hidden states along diversity direction.
+#
+#     At each diffusion step, after the decoder runs, we shift the hidden state
+#     by alpha * direction before projecting to logits.
+#
+#     alpha > 0 → push toward high-diversity output
+#     alpha < 0 → push toward low-diversity output
+#     alpha = 0 → standard generation (no steering)
+#
+#     Args:
+#         model     : SanskritModel (D3PMCrossAttention)
+#         src       : [1, src_len] IAST token ids
+#         direction : [d_model] diversity direction from find_diversity_direction()
+#         alpha     : steering strength
+#         temperature / top_k: sampling params
+#
+#     Returns:
+#         x0_est : [1, tgt_len] generated token ids
+#     """
+#     inner   = model.model
+#     T       = inner.scheduler.num_timesteps
+#     device  = next(inner.parameters()).device
+#
+#     if src.dim() == 1:
+#         src = src.unsqueeze(0)
+#     src = src.to(device)
+#
+#     B       = src.shape[0]
+#     tgt_len = inner.max_seq_len
+#     mask_id = inner.mask_token_id
+#
+#     dir_tensor = torch.tensor(direction, dtype=torch.float32, device=device)
+#
+#     memory, src_pad_mask = inner.encode_source(src)
+#     x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+#     hint   = None
+#
+#     inner.eval()
+#     for t_val in range(T - 1, -1, -1):
+#         t       = torch.full((B,), t_val, dtype=torch.long, device=device)
+#         is_last = (t_val == 0)
+#
+#         # Standard forward_cached but we intercept hidden states
+#         PAD = 1
+#         tgt_pad_mask = None  # inference_mode
+#
+#         _, x_t_ids = inner.forward_process.q_sample(x0_est, t) if t_val > 0 else \
+#                      (None, x0_est)
+#         x      = inner.tgt_embed(x_t_ids)
+#         t_norm = t.float() / inner.scheduler.num_timesteps
+#         t_emb  = inner.time_mlp(t_norm.unsqueeze(-1))
+#         x      = x + t_emb.unsqueeze(1)
+#
+#         if hint is not None:
+#             hint_emb = inner.tgt_embed(hint)
+#             gate     = inner.hint_gate(x)
+#             x        = x + gate * hint_emb
+#
+#         for block in inner.decoder_blocks:
+#             x = block(x, memory, tgt_pad_mask=tgt_pad_mask, src_pad_mask=src_pad_mask)
+#
+#         # ── STEERING: shift hidden states along diversity direction ───
+#         if alpha != 0.0:
+#             x = x + alpha * dir_tensor.unsqueeze(0).unsqueeze(0)
+#
+#         # Project to logits using the head
+#         logits = inner.head(x)
+#
+#         logits = logits / max(temperature, 1e-8)
+#         if top_k > 0:
+#             V = logits.shape[-1]
+#             if top_k < V:
+#                 vals, _ = torch.topk(logits, top_k, dim=-1)
+#                 logits  = logits.masked_fill(logits < vals[..., -1:], float('-inf'))
+#
+#         probs  = F.softmax(logits, dim=-1)
+#         x0_est = torch.argmax(probs, dim=-1) if is_last else _sample(probs)
+#         hint   = x0_est
+#
+#     return x0_est
+#
+#
+# def generate_diversity_spectrum(
+#     model,
+#     src:           torch.Tensor,
+#     direction:     np.ndarray,
+#     tgt_tokenizer,
+#     alphas:        List[float] = [-2.0, -1.0, 0.0, 1.0, 2.0],
+#     temperature:   float       = 0.8,
+#     top_k:         int         = 40,
+# ) -> Dict[float, str]:
+#     """
+#     Generate outputs at 5 points along the diversity direction.
+#
+#     Args:
+#         alphas : steering strengths (negative = low diversity, positive = high)
+#
+#     Returns:
+#         dict mapping alpha → decoded Devanagari string
+#     """
+#     results = {}
+#     for alpha in alphas:
+#         out_ids  = generate_steered(model, src, direction, alpha, temperature, top_k)
+#         ids      = [x for x in out_ids[0].tolist() if x > 4]
+#         text     = tgt_tokenizer.decode(ids).strip()
+#         results[alpha] = text
+#         print(f"  alpha={alpha:+.1f}  → {text}")
+#     return results
+#
+#
+# def plot_pca_space(
+#     hidden_matrix:  np.ndarray,
+#     output_lengths: List[int],
+#     pca:            object,
+#     diversity_pc:   int,
+#     save_path:      Optional[str] = None,
+# ):
+#     """
+#     Scatter plot of examples in PC1 vs PC2 space, coloured by output length.
+#     Highlights the diversity direction.
+#     """
+#     try:
+#         import matplotlib.pyplot as plt
+#     except ImportError:
+#         print("pip install matplotlib.")
+#         return
+#
+#     projected = pca.transform(hidden_matrix)   # [N, n_pc]
+#     lengths   = np.array(output_lengths)
+#
+#     fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+#
+#     # Left: PC0 vs PC1 coloured by length
+#     ax = axes[0]
+#     sc = ax.scatter(projected[:, 0], projected[:, 1],
+#                     c=lengths, cmap='viridis', alpha=0.6, s=15)
+#     plt.colorbar(sc, ax=ax, label="Output length (chars)")
+#     ax.set_xlabel(f"PC0 ({pca.explained_variance_ratio_[0]*100:.1f}%)", fontsize=10)
+#     ax.set_ylabel(f"PC1 ({pca.explained_variance_ratio_[1]*100:.1f}%)", fontsize=10)
+#     ax.set_title("Concept space (PC0 vs PC1)", fontsize=11)
+#
+#     # Right: explained variance
+#     ax2 = axes[1]
+#     cumvar = np.cumsum(pca.explained_variance_ratio_) * 100
+#     ax2.plot(range(1, len(cumvar)+1), cumvar, linewidth=1.5, color='steelblue')
+#     ax2.axvline(diversity_pc, color='coral', linestyle='--', label=f"Diversity PC={diversity_pc}")
+#     ax2.set_xlabel("Number of PCs", fontsize=10)
+#     ax2.set_ylabel("Cumulative variance (%)", fontsize=10)
+#     ax2.set_title("PCA explained variance", fontsize=11)
+#     ax2.legend()
+#     ax2.set_ylim(0, 102)
+#
+#     plt.tight_layout()
+#     if save_path:
+#         import os
+#         os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+#         plt.savefig(save_path, dpi=150, bbox_inches='tight')
+#         print(f"Saved: {save_path}")
+#     else:
+#         plt.show()
+#     plt.close()
+#
+#
+# def _sample(probs):
+#     B, L, V = probs.shape
+#     flat    = probs.view(B * L, V).clamp(min=1e-9)
+#     flat    = flat / flat.sum(dim=-1, keepdim=True)
+#     return torch.multinomial(flat, 1).squeeze(-1).view(B, L)
+"""
+Task 3: Concept Vector Extraction + Controlled Paraphrase Diversity
+Fully corrected & production-ready version
+"""
+import torch
+import torch.nn.functional as F
+import numpy as np
+from typing import List, Tuple, Dict, Optional
+# ─────────────────────────────────────────────────────────────
+# Utility
+# ─────────────────────────────────────────────────────────────
+def _sample(probs: torch.Tensor) -> torch.Tensor:
+    B, L, V = probs.shape
+    flat = probs.view(B * L, V).clamp(min=1e-9)
+    flat = flat / flat.sum(dim=-1, keepdim=True)
+    return torch.multinomial(flat, 1).squeeze(-1).view(B, L)
+# ─────────────────────────────────────────────────────────────
+# 1. Collect Hidden States
+# ─────────────────────────────────────────────────────────────
+@torch.no_grad()
+def collect_hidden_states(
+    model,
+    src_list: List[torch.Tensor],
+    tgt_tokenizer,
+    t_capture: int = 0,
+    temperature: float = 0.8,
+    top_k: int = 40,
+    max_samples: int = 1000,
+) -> Tuple[np.ndarray, List[str], List[int]]:
+    """
+    Collect pooled hidden representations + outputs
+    """
+    inner = model.model
+    device = next(inner.parameters()).device
+    T = inner.scheduler.num_timesteps
+    hidden_list = []
+    texts = []
+    lengths = []
+    print(f"Collecting {min(len(src_list), max_samples)} samples...")
+    for i, src in enumerate(src_list[:max_samples]):
+        if src.dim() == 1:
+            src = src.unsqueeze(0)
+        src = src.to(device)
+        B = src.shape[0]
+        tgt_len = inner.max_seq_len
+        mask_id = inner.mask_token_id
+        # KV Cache (IMPORTANT)
+        memory, src_pad_mask = inner.encode_source(src)
+        x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+        hint = None
+        captured_hidden = None
+        for t_val in range(T - 1, -1, -1):
+            t = torch.full((B,), t_val, dtype=torch.long, device=device)
+            is_last = (t_val == 0)
+            logits, _ = inner.forward_cached(
+                memory,
+                src_pad_mask,
+                x0_est,
+                t,
+                x0_hint=hint,
+                inference_mode=True,
+            )
+            # Capture hidden state
+            if t_val == t_capture:
+                if hasattr(inner, "_last_hidden"):
+                    captured_hidden = inner._last_hidden.detach().cpu()
+            # Sampling
+            logits = logits / max(temperature, 1e-8)
+            if top_k > 0:
+                vals, _ = torch.topk(logits, top_k, dim=-1)
+                logits = logits.masked_fill(logits < vals[..., -1:], float("-inf"))
+            probs = F.softmax(logits, dim=-1)
+            x0_est = torch.argmax(probs, dim=-1) if is_last else _sample(probs)
+            hint = x0_est
+        # Pool hidden
+        if captured_hidden is not None:
+            h = captured_hidden[0].mean(dim=0)  # [d_model]
+            hidden_list.append(h.numpy())
+        # Decode
+        ids = [x for x in x0_est[0].tolist() if x > 4]
+        text = tgt_tokenizer.decode(ids).strip()
+        texts.append(text)
+        lengths.append(len(text))
+        if i % 100 == 0:
+            print(f"{i} done")
+    hidden_matrix = np.stack(hidden_list)
+    print("Collected hidden states:", hidden_matrix.shape)
+    return hidden_matrix, texts, lengths
+# ─────────────────────────────────────────────────────────────
+# 2. PCA
+# ─────────────────────────────────────────────────────────────
+def fit_pca(hidden_matrix: np.ndarray, n_components: int = 50):
+    from sklearn.decomposition import PCA
+    n_comp = min(n_components, hidden_matrix.shape[0] - 1, hidden_matrix.shape[1])
+    pca = PCA(n_components=n_comp)
+    pca.fit(hidden_matrix)
+    print("Explained variance:", pca.explained_variance_ratio_.sum())
+    return pca
+# ─────────────────────────────────────────────────────────────
+# 3. Find Diversity Direction
+# ─────────────────────────────────────────────────────────────
+def find_diversity_direction(hidden_matrix, lengths, pca):
+    from scipy.stats import spearmanr
+    projected = pca.transform(hidden_matrix)
+    lengths = np.array(lengths)
+    scores = []
+    for i in range(projected.shape[1]):
+        r, _ = spearmanr(projected[:, i], lengths)
+        scores.append(abs(r))
+    best_pc = int(np.argmax(scores))
+    print(f"Best PC: {best_pc} | corr={scores[best_pc]:.3f}")
+    direction = pca.components_[best_pc]
+    direction = direction / (np.linalg.norm(direction) + 1e-8)
+    return direction
+# ─────────────────────────────────────────────────────────────
+# 4. Steered Generation
+# ─────────────────────────────────────────────────────────────
+@torch.no_grad()
+def generate_steered(
+    model,
+    src,
+    direction,
+    alpha=0.0,
+    temperature=0.8,
+    top_k=40,
+):
+    inner = model.model
+    device = next(inner.parameters()).device
+    T = inner.scheduler.num_timesteps
+    if src.dim() == 1:
+        src = src.unsqueeze(0)
+    src = src.to(device)
+    B = src.shape[0]
+    tgt_len = inner.max_seq_len
+    mask_id = inner.mask_token_id
+    direction = torch.tensor(direction, dtype=torch.float32, device=device)
+    direction = direction / (torch.norm(direction) + 1e-6)
+    memory, src_pad_mask = inner.encode_source(src)
+    x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+    hint = None
+    for t_val in range(T - 1, -1, -1):
+        t = torch.full((B,), t_val, dtype=torch.long, device=device)
+        is_last = (t_val == 0)
+        logits, _ = inner.forward_cached(
+            memory,
+            src_pad_mask,
+            x0_est,
+            t,
+            x0_hint=hint,
+            inference_mode=True,
+        )
+        # Inject diversity
+        if hasattr(inner, "_last_hidden") and alpha != 0.0:
+            h = inner._last_hidden
+            h = h + alpha * direction.unsqueeze(0).unsqueeze(0)
+            logits = inner.head(h)
+        # Sampling
+        logits = logits / max(temperature, 1e-8)
+        if top_k > 0:
+            vals, _ = torch.topk(logits, top_k, dim=-1)
+            logits = logits.masked_fill(logits < vals[..., -1:], float("-inf"))
+        probs = F.softmax(logits, dim=-1)
+        x0_est = torch.argmax(probs, dim=-1) if is_last else _sample(probs)
+        hint = x0_est
+    return x0_est
+# ─────────────────────────────────────────────────────────────
+# 5. Diversity Spectrum
+# ─────────────────────────────────────────────────────────────
+def generate_diversity_spectrum(
+    model,
+    src,
+    direction,
+    tgt_tokenizer,
+    alphas=[-2, -1, 0, 1, 2],
+):
+    results = {}
+    print("\nDiversity Spectrum:\n")
+    for alpha in alphas:
+        out_ids = generate_steered(model, src, direction, alpha)
+        ids = [x for x in out_ids[0].tolist() if x > 4]
+        text = tgt_tokenizer.decode(ids).strip()
+        print(f"{alpha:+} → {text}")
+        results[alpha] = text
+    return results
+# ─────────────────────────────────────────────────────────────
+# 6. Visualization
+# ─────────────────────────────────────────────────────────────
+def plot_pca_space(hidden_matrix, lengths, pca):
+    import matplotlib.pyplot as plt
+    proj = pca.transform(hidden_matrix)
+    plt.figure(figsize=(8, 6))
+    sc = plt.scatter(proj[:, 0], proj[:, 1], c=lengths)
+    plt.colorbar(sc)
+    plt.title("Concept Space")
+    plt.xlabel("PC1")
+    plt.ylabel("PC2")
+    plt.show()

analysis/kv_cache_benchmark.py ADDED Viewed

	@@ -0,0 +1,451 @@

+# """
+# analysis/kv_cache_benchmark.py
+# ================================
+# Task 1: Benchmark KV cache vs standard generate().
+#
+# Measures:
+#   - Wall-clock time for generate() vs generate_cached()
+#   - Encoder time as % of total generation time (before/after)
+#   - Speedup ratio at src_len = 16, 32, 64 tokens
+#
+# How it works:
+#   Standard generate():
+#     For each of T=128 steps:
+#       src → encoder → memory → decoder → logits    (encoder runs 128 times)
+#
+#   generate_cached():
+#     src → encoder → memory (once)
+#     For each of T=128 steps:
+#       cached_memory → decoder → logits              (encoder runs 1 time)
+#
+#   Expected speedup:
+#     If encoder = 30% of per-step time:
+#       Saved = 127/128 * 30% ≈ 29.7% of total time
+#     If encoder = 50% of per-step time:
+#       Saved ≈ 49.6% of total time
+#
+# Usage:
+#     python -m analysis.kv_cache_benchmark
+#     or:
+#     from analysis.kv_cache_benchmark import run_benchmark
+#     results = run_benchmark(model, src_tokenizer, device)
+# """
+#
+# import torch
+# import time
+# import numpy as np
+# from typing import Dict, List
+#
+#
+# def _make_src(src_len: int, src_vocab: int, device: torch.device, batch_size: int = 1):
+#     """Create a random source tensor of given length."""
+#     # Random real tokens (ids 5..src_vocab-1), padded to src_len
+#     ids = torch.randint(5, src_vocab, (batch_size, src_len), device=device)
+#     return ids
+#
+#
+# def _time_fn(fn, n_warmup: int = 2, n_runs: int = 5) -> float:
+#     """
+#     Time a zero-argument callable.
+#     Returns mean wall-clock seconds over n_runs after n_warmup warmup calls.
+#     """
+#     # Warmup
+#     for _ in range(n_warmup):
+#         fn()
+#         if torch.cuda.is_available():
+#             torch.cuda.synchronize()
+#         elif torch.backends.mps.is_available():
+#             torch.mps.synchronize()
+#
+#     times = []
+#     for _ in range(n_runs):
+#         start = time.perf_counter()
+#         fn()
+#         if torch.cuda.is_available():
+#             torch.cuda.synchronize()
+#         elif torch.backends.mps.is_available():
+#             torch.mps.synchronize()
+#         times.append(time.perf_counter() - start)
+#
+#     return float(np.mean(times))
+#
+#
+# def benchmark_encoder_cost(
+#     model,
+#     src:    torch.Tensor,
+# ) -> Dict[str, float]:
+#     """
+#     Measure encoder time as a fraction of one full forward pass.
+#
+#     Returns:
+#         encoder_s   : seconds for one encoder call
+#         full_step_s : seconds for one full forward_cached decoder step
+#         encoder_pct : encoder_s / (encoder_s + full_step_s) * 100
+#     """
+#     inner = model.model
+#     if not hasattr(inner, 'encode_source'):
+#         raise ValueError("Model does not support KV cache (not D3PMCrossAttention).")
+#
+#     device = src.device
+#     B      = src.shape[0]
+#     T      = inner.scheduler.num_timesteps
+#     tgt_len = inner.max_seq_len
+#     mask_id = inner.mask_token_id
+#
+#     x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+#     t      = torch.zeros(B, dtype=torch.long, device=device)
+#
+#     # Time encoder alone
+#     encoder_s = _time_fn(lambda: inner.encode_source(src))
+#
+#     # Pre-compute memory for decoder timing
+#     memory, src_pad_mask = inner.encode_source(src)
+#
+#     # Time one decoder step (cached)
+#     decoder_s = _time_fn(
+#         lambda: inner.forward_cached(memory, src_pad_mask, x0_est, t,
+#                                      inference_mode=True)
+#     )
+#
+#     # Time one full step (non-cached = encoder + decoder)
+#     full_s = _time_fn(
+#         lambda: inner.forward(src, x0_est, t, inference_mode=True)
+#     )
+#
+#     encoder_pct = 100.0 * encoder_s / max(full_s, 1e-9)
+#
+#     return {
+#         "encoder_s":   encoder_s,
+#         "decoder_s":   decoder_s,
+#         "full_step_s": full_s,
+#         "encoder_pct": encoder_pct,
+#     }
+#
+#
+# def run_benchmark(
+#     model,
+#     src_tokenizer,
+#     device:        torch.device,
+#     src_lens:      List[int] = [16, 32, 64],
+#     n_runs:        int       = 5,
+# ) -> Dict:
+#     """
+#     Full benchmark: compare generate() vs generate_cached() at multiple src lengths.
+#
+#     Args:
+#         model         : SanskritModel (D3PMCrossAttention)
+#         src_tokenizer : SanskritSourceTokenizer
+#         device        : torch.device
+#         src_lens      : list of source lengths to benchmark
+#         n_runs        : number of timing runs per condition
+#
+#     Returns:
+#         results dict with timing and speedup for each src_len
+#     """
+#     inner = model.model
+#     if not hasattr(inner, 'generate_cached'):
+#         raise ValueError("Model does not support KV cache (not D3PMCrossAttention).")
+#
+#     src_vocab = inner.src_embed.token_emb.weight.shape[0]
+#     results   = {}
+#
+#     print("\n" + "=" * 65)
+#     print("  KV CACHE BENCHMARK")
+#     print("=" * 65)
+#     print(f"  {'src_len':>8}  {'standard(s)':>12}  {'cached(s)':>10}  "
+#           f"{'speedup':>8}  {'encoder%':>9}")
+#     print("-" * 65)
+#
+#     for src_len in src_lens:
+#         src = _make_src(src_len, src_vocab, device)
+#
+#         # Encoder cost breakdown
+#         enc_cost = benchmark_encoder_cost(model, src)
+#
+#         # Time standard generate() — encoder runs T times
+#         def run_standard():
+#             return inner.generate(src, temperature=0.8, top_k=40)
+#
+#         # Time generate_cached() — encoder runs once
+#         def run_cached():
+#             return inner.generate_cached(src, temperature=0.8, top_k=40)
+#
+#         t_standard = _time_fn(run_standard, n_warmup=1, n_runs=n_runs)
+#         t_cached   = _time_fn(run_cached,   n_warmup=1, n_runs=n_runs)
+#         speedup    = t_standard / max(t_cached, 1e-9)
+#
+#         results[src_len] = {
+#             "standard_s":  t_standard,
+#             "cached_s":    t_cached,
+#             "speedup":     speedup,
+#             "encoder_pct": enc_cost["encoder_pct"],
+#         }
+#
+#         print(f"  {src_len:>8}  {t_standard:>12.3f}  {t_cached:>10.3f}  "
+#               f"{speedup:>7.2f}x  {enc_cost['encoder_pct']:>8.1f}%")
+#
+#     print("=" * 65)
+#     print(f"\n  Encoder cost = % of one full forward pass")
+#     print(f"  Speedup = standard_time / cached_time")
+#     print(f"  Expected: speedup ≈ 1 / (1 - encoder_pct/100 * (T-1)/T)")
+#
+#     return results
+#
+#
+# def print_summary(results: Dict):
+#     """Print a human-readable summary of benchmark results."""
+#     print("\n  SUMMARY")
+#     print("  -------")
+#     for src_len, r in results.items():
+#         saved_pct = (1.0 - 1.0 / r["speedup"]) * 100
+#         print(f"  src_len={src_len}: {r['speedup']:.2f}x speedup "
+#               f"({saved_pct:.1f}% time saved, "
+#               f"encoder was {r['encoder_pct']:.1f}% of total)")
+#
+#
+# if __name__ == "__main__":
+#     import sys, os
+#     sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+#     from config import CONFIG
+#     from inference import load_model
+#     from models.tokenizer import SanskritSourceTokenizer
+#
+#     cfg    = CONFIG
+#     device = torch.device(cfg['training']['device'])
+#
+#     model_name = cfg['model_type']
+#     has_neg    = cfg['data']['include_negative_examples']
+#     ckpt       = f"results7/{model_name}_neg_{has_neg}/best_model.pt"
+#
+#     if not os.path.exists(ckpt):
+#         print(f"No checkpoint at {ckpt}. Train first.")
+#         sys.exit(1)
+#
+#     model, cfg = load_model(ckpt, cfg, device)
+#     model.eval()
+#
+#     src_tokenizer = SanskritSourceTokenizer(
+#         vocab_size = cfg['model'].get('src_vocab_size', 500),
+#         max_len    = cfg['model']['max_seq_len'],
+#     )
+#
+#     results = run_benchmark(model, src_tokenizer, device)
+#     print_summary(results)
+# ============================================================
+# FULL TASK 1: KV CACHE + PROJECTION + BENCHMARK + GRAPHS
+# ============================================================
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+# ============================================================
+# 🔧 MODEL (PATCHED WITH PROJECTION + KV CACHE)
+# ============================================================
+class D3PMCrossAttention(nn.Module):
+    def __init__(self, d_model=512, vocab_size=500, max_seq_len=64, T=128):
+        super().__init__()
+        self.d_model = d_model
+        self.max_seq_len = max_seq_len
+        self.mask_token_id = 0
+        # Dummy encoder/decoder (replace with yours)
+        self.encoder = nn.Embedding(vocab_size, d_model)
+        self.tgt_embed = nn.Embedding(vocab_size, d_model)
+        self.head = nn.Linear(d_model, vocab_size)
+        self.time_mlp = nn.Linear(1, d_model)
+        self.hint_gate = nn.Linear(d_model, d_model)
+        # Fake scheduler
+        class Scheduler:
+            def __init__(self, T):
+                self.num_timesteps = T
+        self.scheduler = Scheduler(T)
+        # 🔥 Projection layer (Task 1 requirement)
+        self.semantic_proj = nn.Linear(d_model, d_model // 2)
+        self.semantic_up   = nn.Linear(d_model // 2, d_model)
+    # ========================================================
+    # ✅ ENCODER WITH PROJECTION
+    # ========================================================
+    def encode_source(self, src):
+        memory = self.encoder(src)   # [B, L, d]
+        # 🔥 Compress → Expand
+        compressed = self.semantic_proj(memory)
+        memory     = self.semantic_up(compressed)
+        src_pad_mask = None
+        return memory, src_pad_mask
+    # ========================================================
+    # ✅ STANDARD (NO CACHE)
+    # ========================================================
+    def forward(self, src, x, t):
+        memory, mask = self.encode_source(src)
+        return self.forward_cached(memory, mask, x, t)
+    # ========================================================
+    # ✅ CACHED FORWARD
+    # ========================================================
+    def forward_cached(self, memory, src_pad_mask, x, t, hint=None):
+        x = self.tgt_embed(x)
+        t_emb = self.time_mlp((t.float()/self.scheduler.num_timesteps).unsqueeze(-1))
+        x = x + t_emb.unsqueeze(1)
+        if hint is not None:
+            x = x + self.hint_gate(x) * self.tgt_embed(hint)
+        logits = self.head(x)
+        self._last_hidden = x
+        return logits, None
+    # ========================================================
+    # ❌ OLD GENERATE (SLOW)
+    # ========================================================
+    @torch.no_grad()
+    def generate(self, src):
+        B = src.shape[0]
+        device = src.device
+        T = self.scheduler.num_timesteps
+        x = torch.zeros((B, self.max_seq_len), dtype=torch.long, device=device)
+        for t_val in range(T - 1, -1, -1):
+            t = torch.full((B,), t_val, device=device)
+            logits, _ = self.forward(src, x, t)
+            probs = F.softmax(logits, dim=-1)
+            x = torch.argmax(probs, dim=-1)
+        return x
+    # ========================================================
+    # ✅ FAST GENERATE (KV CACHE)
+    # ========================================================
+    @torch.no_grad()
+    def generate_cached(self, src):
+        B = src.shape[0]
+        device = src.device
+        T = self.scheduler.num_timesteps
+        # 🔥 Encode once
+        memory, mask = self.encode_source(src)
+        x = torch.zeros((B, self.max_seq_len), dtype=torch.long, device=device)
+        hint = None
+        for t_val in range(T - 1, -1, -1):
+            t = torch.full((B,), t_val, device=device)
+            logits, _ = self.forward_cached(memory, mask, x, t, hint)
+            probs = F.softmax(logits, dim=-1)
+            x = torch.argmax(probs, dim=-1)
+            hint = x
+        return x
+# ============================================================
+# 📊 BENCHMARK + MEMORY + GRAPHS
+# ============================================================
+def benchmark(model, device):
+    model.to(device)
+    model.eval()
+    vocab = 500
+    src_lens = [16, 32, 64]
+    standard_times = []
+    cached_times   = []
+    speedups       = []
+    memory_savings = []
+    for src_len in src_lens:
+        print(f"\n🔹 src_len = {src_len}")
+        src = torch.randint(5, vocab, (1, src_len)).to(device)
+        # -------- STANDARD --------
+        torch.cuda.reset_peak_memory_stats()
+        start = time.time()
+        model.generate(src)
+        torch.cuda.synchronize()
+        t_std = time.time() - start
+        mem_std = torch.cuda.max_memory_allocated() / 1024**2
+        # -------- CACHED --------
+        torch.cuda.reset_peak_memory_stats()
+        start = time.time()
+        model.generate_cached(src)
+        torch.cuda.synchronize()
+        t_cache = time.time() - start
+        mem_cache = torch.cuda.max_memory_allocated() / 1024**2
+        speedup = t_std / t_cache
+        mem_red = 100 * (mem_std - mem_cache) / mem_std
+        print(f"Time: {t_std:.2f}s → {t_cache:.2f}s  |  {speedup:.2f}x")
+        print(f"Memory: {mem_std:.0f}MB → {mem_cache:.0f}MB  |  {mem_red:.1f}%")
+        standard_times.append(t_std)
+        cached_times.append(t_cache)
+        speedups.append(speedup)
+        memory_savings.append(mem_red)
+    # ==========================
+    # 📈 PLOT: TIME
+    # ==========================
+    plt.figure()
+    plt.plot(src_lens, standard_times, marker='o', label="Standard")
+    plt.plot(src_lens, cached_times, marker='o', label="Cached")
+    plt.xlabel("Source Length")
+    plt.ylabel("Time (s)")
+    plt.title("Generation Time")
+    plt.legend()
+    plt.grid()
+    plt.show()
+    # ==========================
+    # 📈 PLOT: SPEEDUP
+    # ==========================
+    plt.figure()
+    plt.plot(src_lens, speedups, marker='o')
+    plt.xlabel("Source Length")
+    plt.ylabel("Speedup (x)")
+    plt.title("KV Cache Speedup")
+    plt.grid()
+    plt.show()
+    # ==========================
+    # 📈 PLOT: MEMORY
+    # ==========================
+    plt.figure()
+    plt.plot(src_lens, memory_savings, marker='o')
+    plt.xlabel("Source Length")
+    plt.ylabel("Memory Reduction (%)")
+    plt.title("Memory Savings")
+    plt.grid()
+    plt.show()
+# ============================================================
+# 🚀 RUN
+# ============================================================
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = D3PMCrossAttention()
+benchmark(model, device)

analysis/outputs/task1_kv_cache.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+TASK 1 — KV CACHE BENCHMARK
+========================================
+ src_len   standard(s)   cached(s)   speedup   encoder%   mem-save%
+      16         3.431       3.512     0.98x     133.1%       50.0%
+          source-mem before=0.070MB  after=0.035MB
+      32         3.626       3.555     1.02x      36.8%       50.0%
+          source-mem before=0.141MB  after=0.070MB
+      64         3.585       3.701     0.97x      53.3%       50.0%
+          source-mem before=0.281MB  after=0.141MB
+  Encoder cost = % of one full forward pass
+  Speedup = standard_time / cached_time
+  Expected: speedup ≈ 1 / (1 - encoder_pct/100 * (T-1)/T)
+  SUMMARY
+  -------
+  src_len=16: 0.98x speedup (-2.4% time saved, encoder was 133.1% of total, estimated memory change 50.0%)
+  src_len=32: 1.02x speedup (1.9% time saved, encoder was 36.8% of total, estimated memory change 50.0%)
+  src_len=64: 0.97x speedup (-3.2% time saved, encoder was 53.3% of total, estimated memory change 50.0%)

analysis/outputs/task2_all_layers_t0.png ADDED Viewed

analysis/outputs/task2_attn_evolution.png ADDED Viewed

analysis/outputs/task2_attn_t0.png ADDED Viewed

analysis/outputs/task2_attn_t127.png ADDED Viewed

analysis/outputs/task2_examples/example_1_attn_t0.png ADDED Viewed

analysis/outputs/task2_examples/example_2_attn_t0.png ADDED Viewed

analysis/outputs/task2_examples/example_3_attn_t0.png ADDED Viewed

analysis/outputs/task2_examples/example_4_attn_t0.png ADDED Viewed

analysis/outputs/task2_examples/example_5_attn_t0.png ADDED Viewed

analysis/outputs/task2_report.txt ADDED Viewed

	@@ -0,0 +1,100 @@

+TASK 2 — ATTENTION + DRIFT REPORT
+==================================================
+Input  : dharmo rakṣati rakṣitaḥ
+Output : कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा ब्र कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा ध्या ध्या ध्या कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा
+Lock-in t : 122
+Mean pos lock-in : 118.7 ± 17.7
+Source alignment metric : bertscore_f1
+Best source-alignment step : t=127
+Locked positions : 12
+Flexible positions : 8
+TF-IDF vs attention stability correlation : 0.0
+Step → Output → CER-to-final
+------------------------------------------------------------
+  t= 127  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.2293
+  t= 122  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0769
+  t= 117  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0698
+  t= 112  |  कुङ्कुमा लये कुङ्कुमा कुङ्कुमा कुङ्कुमा   |  0.0541
+  t= 107  |  कुङ्कुमा ध्या कुङ्कुमा कुङ्कुमा कुङ्कुमा  |  0.0670
+  t= 102  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0442
+  t=  97  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0342
+  t=  92  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0456
+  t=  87  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0299
+  t=  82  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0214
+  t=  77  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0214
+  t=  72  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0214
+  t=  67  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0214
+  t=  62  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0128
+  t=  57  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0128
+  t=  52  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0128
+  t=  47  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0043
+  t=  42  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0043
+  t=  37  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+  t=  32  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+  t=  27  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+  t=  22  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+  t=  17  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+  t=  12  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+  t=   7  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+  t=   2  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+  t=   0  |  कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्कुमा कुङ्  |  0.0000
+Step → Source alignment
+------------------------------------------------------------
+  t= 127  |  0.4312
+  t= 122  |  0.3941
+  t= 117  |  0.3963
+  t= 112  |  0.3871
+  t= 107  |  0.3947
+  t= 102  |  0.3950
+  t=  97  |  0.3894
+  t=  92  |  0.3887
+  t=  87  |  0.3897
+  t=  82  |  0.3881
+  t=  77  |  0.3881
+  t=  72  |  0.3881
+  t=  67  |  0.3881
+  t=  62  |  0.3889
+  t=  57  |  0.3889
+  t=  52  |  0.3889
+  t=  47  |  0.3882
+  t=  42  |  0.3882
+  t=  37  |  0.3901
+  t=  32  |  0.3901
+  t=  27  |  0.3901
+  t=  22  |  0.3901
+  t=  17  |  0.3901
+  t=  12  |  0.3901
+  t=   7  |  0.3901
+  t=   2  |  0.3901
+  t=   0  |  0.3901
+Locked target positions
+------------------------------------------------------------
+  tgt[0]=कुङ्कुमा  → src[3]=taḥ  stability=0.781
+  tgt[1]=शिरः  → src[3]=taḥ  stability=0.781
+  tgt[2]=कुङ्कुमा  → src[3]=taḥ  stability=0.780
+  tgt[3]=कुङ्कुमा  → src[2]=rakṣi  stability=0.780
+  tgt[4]=पुरतो  → src[2]=rakṣi  stability=0.781
+  tgt[5]=कुङ्कुमा  → src[2]=rakṣi  stability=0.781
+  tgt[8]=मु  → src[3]=taḥ  stability=0.782
+  tgt[9]=कुङ्कुमा  → src[3]=taḥ  stability=0.783
+  tgt[10]=कुङ्कुमा  → src[3]=taḥ  stability=0.783
+  tgt[11]=कुङ्कुमा  → src[3]=taḥ  stability=0.781
+  tgt[13]=कुङ्कुमा  → src[2]=rakṣi  stability=0.781
+  tgt[14]=कुङ्कुमा  → src[2]=rakṣi  stability=0.781
+Flexible target positions
+------------------------------------------------------------
+  tgt[6]=कुङ्कुमा  → src[2]=rakṣi  stability=0.731
+  tgt[7]=कुङ्कुमा  → src[2]=rakṣi  stability=0.481
+  tgt[12]=कुङ्कुमा  → src[2]=rakṣi  stability=0.431
+  tgt[15]=कुङ्कुमा  → src[2]=rakṣi  stability=0.480
+  tgt[16]=कुङ्कुमा  → src[2]=rakṣi  stability=0.479
+  tgt[17]=कुङ्कुमा  → src[2]=rakṣi  stability=0.428
+  tgt[18]=कुङ्कुमा  → src[3]=taḥ  stability=0.727
+  tgt[19]=कुङ्कुमा  → src[0]=dharmo  stability=0.377

analysis/outputs/task2_semantic_drift.png ADDED Viewed

analysis/outputs/task2_source_alignment.png ADDED Viewed

analysis/outputs/task3_concept_space.png ADDED Viewed

Git LFS Details

SHA256: 22933b0a457dfd10d659987574594b5dd8e88c8b25b5bb3f9cd5f9517f9f4865
Pointer size: 131 Bytes
Size of remote file: 202 kB

analysis/outputs/task3_diversity_direction.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1dff757876fd9352d5c1f86d2af244c9784d9ec66639a0f31ec5f6c9ec608d4b
+size 1664

analysis/outputs/task3_report.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+TASK 3 — CONCEPT VECTORS + PCA STEERING
+==================================================
+PCA: 50 components, 96.1% variance
+Diversity PC: 1  (|r|=0.303 with output length)
+Diversity spectrum:
+  alpha=-2.0  →  विष द्धा समन्व ददर्श रे विष रे द्धा रे रे ष्व विष रे विष रे रे रे विष रे रे रे विष रे रे कार साग ददर्श वादि रे रे रे रे ददर्श रे रे रे विस्त रे रे समन्व सुर रे वस्तु रे रे रे रे रे रे रे सुर रे रे रे रे रे सुर रे ैक किंचि वस्तु विष रे कार रे विष कार गतिं रे कार शो कार कार कार साग समन्व रे कार कार कार
+  alpha=-1.0  →  रे विष विष ष्व रे रे विष विष रे विष ददर्श रे ्य् रे रे रे विष रे रे शः रे भवि वस्तु रे विष ्य् विष रे रे वस्तु घा वादि रे रे ्य् रे रे ्य् रे रे रे ्य् पृत रे रे नृप रे द्धा रे रे रे रे ्य् रे रे त्तु रे ्य् रे विष रे सुर साग विष रे कार विष विष ्य् रे रे ्य् ्य् ्य् ्य् रे कार कार कार कार
+  alpha=+0.0  →  विष ष्व भवि दित्य द्धा रे तौ वृ ्य् रे वादि ॠ रे विष रे ष्व रे का रे ्य् रे ्य् विष ्य् ष्व ्य् वृ जना रे भवि वस्तु त्रिषु विष घा भु की ्य् वृ रे भु यां वृ रे भु यां समु रे रे ्य् रे भु वृ ्य् क्ष ्य् ान्त ्य् ्य् ्य् व्रजेत् ्य् भु रे रे ्य् रे उक्त ्य् ्य् समन्व ्य् ्य् सु ल्प वीर ्य् ्य् ्य् विष ्य्
+  alpha=+1.0  →  ॠ वृ वृ वृ वृ वृ ण् भवि ्त वृ वृ दश ्य् यां ॠ भु तं भु भु ान्त भवि भु भु रे यां वस्तु यां यां भु यां यां यां यां ्य् यां भु दृष्ट दृष्ट यां यां भु यां यां यां यां द्वि भु यां भु क्ष भु भु भु ष्ट रु ब्र भु न्तु ण्ड यां भु यां ्य् क्ष ्य् वृ ्य् , यां भु यां भु रोध भु ्य् यां ्य् ्य् यां यां
+  alpha=+2.0  →  वृ वृ वृ ण् वृ वृ ब्र वृ ष्ट ष्ट ष्ट ्य् मा यां ष्ट यां ब्र यां तं तं भु भु वृ भु यां धनम् यां क्ष यां द्वि भु यां यां यां यां द्वि यां भु भु यां यां भु यां क्ष यां भु यां भु ्य् यां भु यां यां मा यां यां भु वृ यां धा भु यां यां मा भु हृ यां यां यां भु द्वि यां द्वि ब्र ण्ड मा द्वि यां यां भु

analysis/outputs/task5_quality_classifier.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0410b67872dbf030b2db5410ecca92f6357d90ae9f47f2c7cf1ad8202c274f61
+size 233761

analysis/outputs/task5_quality_data.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dad6d37cae2b157877a4106d92528417981f75ae57cddfd46112441cd7e9a338
+size 770512

analysis/outputs_multi/results__d3pm_cross_attention_neg_False/task1/task1_kv_cache.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+TASK 1 — KV CACHE BENCHMARK
+========================================
+ src_len   standard(s)   cached(s)   speedup   encoder%   mem-save%
+      16         3.309       3.624     0.91x      52.9%       50.0%
+          source-mem before=0.070MB  after=0.035MB
+      32         4.214       4.234     1.00x      40.0%       50.0%
+          source-mem before=0.141MB  after=0.070MB
+      64         6.929       8.372     0.83x      58.7%       50.0%
+          source-mem before=0.281MB  after=0.141MB

analysis/outputs_multi/results__d3pm_cross_attention_neg_True/task1/task1_kv_cache.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+TASK 1 — KV CACHE BENCHMARK
+========================================
+ src_len   standard(s)   cached(s)   speedup   encoder%   mem-save%
+      16         2.548       2.464     1.03x      31.6%       50.0%
+          source-mem before=0.070MB  after=0.035MB
+      32         3.222       2.952     1.09x      37.8%       50.0%
+          source-mem before=0.141MB  after=0.070MB
+      64         4.121       4.335     0.95x      33.6%       50.0%
+          source-mem before=0.281MB  after=0.141MB

analysis/quality_classifier.py ADDED Viewed

	@@ -0,0 +1,723 @@

+# """
+# analysis/quality_classifier.py
+# ================================
+# Task 5: Classifier-Free Guidance for Paraphrase Quality Control
+#
+# Two steps — only Step 2 requires training a SMALL model (not the main D3PM):
+#
+# STEP 1 — Collect training data (no training):
+#   Run existing model on val set, record (hidden_state, CER) pairs.
+#   Hidden states come from model.model._last_hidden after forward_cached().
+#   CER score = quality label (lower CER = higher quality).
+#
+# STEP 2 — Train quality classifier:
+#   Small 2-layer MLP: d_model → 64 → 1
+#   Input: pooled decoder hidden state [B, d_model]
+#   Output: predicted quality score in [0, 1]  (1 = high quality)
+#   Loss: MSE against normalized CER labels
+#   Training time: ~5-10 minutes on CPU for 10k examples
+#
+# STEP 3 — Guided inference (no retraining):
+#   At each diffusion step, use classifier gradient to shift logits:
+#     guided_logits = logits + λ * ∂(quality_score)/∂(logits)
+#   Higher λ → model biased toward high-quality outputs
+#   λ=0 → standard generation (no guidance)
+#
+# Key: main D3PM model is FROZEN throughout. Only the 10k-param classifier trains.
+# """
+#
+# import torch
+# import torch.nn as nn
+# import torch.nn.functional as F
+# import numpy as np
+# import os
+# import json
+# from typing import List, Dict, Optional, Tuple
+#
+#
+# # ── Quality classifier architecture ──────────────────────────────────
+#
+# class QualityClassifier(nn.Module):
+#     """
+#     Lightweight MLP that predicts transliteration quality from decoder
+#     hidden states.
+#
+#     Architecture:
+#       d_model → 128 → 64 → 1 → Sigmoid
+#
+#     Input:  mean-pooled decoder hidden state [B, d_model]
+#     Output: quality score [B, 1] ∈ [0, 1]  (1 = high quality)
+#
+#     ~10k parameters. Trains in minutes on CPU.
+#     """
+#     def __init__(self, d_model: int):
+#         super().__init__()
+#         self.net = nn.Sequential(
+#             nn.Linear(d_model, 128),
+#             nn.ReLU(),
+#             nn.Dropout(0.1),
+#             nn.Linear(128, 64),
+#             nn.ReLU(),
+#             nn.Linear(64, 1),
+#             nn.Sigmoid(),
+#         )
+#         self.d_model = d_model
+#
+#     def forward(self, hidden: torch.Tensor) -> torch.Tensor:
+#         """
+#         Args:
+#             hidden : [B, tgt_len, d_model] OR [B, d_model] (already pooled)
+#
+#         Returns:
+#             score : [B, 1] quality score in [0, 1]
+#         """
+#         if hidden.dim() == 3:
+#             # Pool over sequence length
+#             hidden = hidden.mean(dim=1)   # [B, d_model]
+#         return self.net(hidden)           # [B, 1]
+#
+#
+# # ── Training data collection ──────────────────────────────────────────
+#
+# @torch.no_grad()
+# def collect_quality_data(
+#     model,
+#     src_list:      List[torch.Tensor],
+#     ref_list:      List[str],
+#     tgt_tokenizer,
+#     t_capture:     int   = 0,
+#     temperature:   float = 0.8,
+#     top_k:         int   = 40,
+#     max_samples:   int   = 5000,
+# ) -> Tuple[np.ndarray, np.ndarray]:
+#     """
+#     Collect (hidden_state, quality_score) pairs for classifier training.
+#
+#     For each sample:
+#       1. Run generate_cached() on src
+#       2. Capture decoder hidden state at t=t_capture
+#       3. Compute CER between output and reference
+#       4. Quality = 1 - CER  (normalize to [0,1])
+#
+#     Args:
+#         model         : SanskritModel
+#         src_list      : list of [1, src_len] tensors
+#         ref_list      : list of reference Devanagari strings
+#         tgt_tokenizer : SanskritTargetTokenizer
+#         t_capture     : which step to capture hidden states (0 = final)
+#         max_samples   : cap number of training examples
+#
+#     Returns:
+#         hidden_matrix : np.ndarray [N, d_model]
+#         quality_scores: np.ndarray [N]  values in [0, 1]
+#     """
+#     inner  = model.model
+#     T      = inner.scheduler.num_timesteps
+#     device = next(inner.parameters()).device
+#
+#     hidden_list  = []
+#     quality_list = []
+#     n            = min(len(src_list), max_samples)
+#
+#     def cer(pred, ref):
+#         if not ref:
+#             return 1.0
+#         def ed(s1, s2):
+#             m, n = len(s1), len(s2)
+#             dp = list(range(n + 1))
+#             for i in range(1, m + 1):
+#                 prev, dp[0] = dp[0], i
+#                 for j in range(1, n + 1):
+#                     temp = dp[j]
+#                     dp[j] = prev if s1[i-1] == s2[j-1] else 1 + min(prev, dp[j], dp[j-1])
+#                     prev = temp
+#             return dp[n]
+#         return ed(pred, ref) / max(len(ref), 1)
+#
+#     print(f"Collecting quality data from {n} examples...")
+#     for i, (src, ref) in enumerate(zip(src_list[:n], ref_list[:n])):
+#         if i % 200 == 0:
+#             print(f"  {i}/{n}")
+#
+#         if src.dim() == 1:
+#             src = src.unsqueeze(0)
+#         src = src.to(device)
+#
+#         B       = src.shape[0]
+#         tgt_len = inner.max_seq_len
+#         mask_id = inner.mask_token_id
+#
+#         memory, src_pad_mask = inner.encode_source(src)
+#         x0_est  = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+#         hint    = None
+#         h_cap   = None
+#
+#         for t_val in range(T - 1, -1, -1):
+#             t       = torch.full((B,), t_val, dtype=torch.long, device=device)
+#             is_last = (t_val == 0)
+#
+#             logits, _ = inner.forward_cached(
+#                 memory, src_pad_mask, x0_est, t,
+#                 x0_hint=hint, inference_mode=True,
+#             )
+#
+#             if t_val == t_capture and hasattr(inner, '_last_hidden'):
+#                 h_cap = inner._last_hidden[0].mean(dim=0).detach().cpu()  # [d_model]
+#
+#             logits = logits / max(temperature, 1e-8)
+#             if top_k > 0:
+#                 V = logits.shape[-1]
+#                 if top_k < V:
+#                     vals, _ = torch.topk(logits, top_k, dim=-1)
+#                     logits  = logits.masked_fill(logits < vals[..., -1:], float('-inf'))
+#
+#             probs  = F.softmax(logits, dim=-1)
+#             x0_est = torch.argmax(probs, dim=-1) if is_last else _sample(probs)
+#             hint   = x0_est
+#
+#         if h_cap is None:
+#             continue
+#
+#         ids  = [x for x in x0_est[0].tolist() if x > 4]
+#         pred = tgt_tokenizer.decode(ids).strip()
+#         q    = max(0.0, 1.0 - cer(pred, ref))   # quality = 1 - CER
+#
+#         hidden_list.append(h_cap.numpy())
+#         quality_list.append(q)
+#
+#     print(f"Collected {len(hidden_list)} quality examples.")
+#     print(f"Quality stats: mean={np.mean(quality_list):.3f}  "
+#           f"min={np.min(quality_list):.3f}  max={np.max(quality_list):.3f}")
+#
+#     return np.stack(hidden_list), np.array(quality_list, dtype=np.float32)
+#
+#
+# def _sample(probs):
+#     B, L, V = probs.shape
+#     flat    = probs.view(B * L, V).clamp(min=1e-9)
+#     flat    = flat / flat.sum(dim=-1, keepdim=True)
+#     return torch.multinomial(flat, 1).squeeze(-1).view(B, L)
+#
+#
+# # ── Training ──────────────────────────────────────────────────────────
+#
+# def train_quality_classifier(
+#     hidden_matrix:  np.ndarray,
+#     quality_scores: np.ndarray,
+#     d_model:        int,
+#     epochs:         int   = 30,
+#     batch_size:     int   = 64,
+#     lr:             float = 1e-3,
+#     val_frac:       float = 0.1,
+#     save_path:      Optional[str] = None,
+# ) -> QualityClassifier:
+#     """
+#     Train QualityClassifier on collected (hidden, quality) pairs.
+#
+#     Args:
+#         hidden_matrix  : [N, d_model] from collect_quality_data()
+#         quality_scores : [N] quality labels in [0, 1]
+#         d_model        : hidden dimension
+#         epochs         : training epochs
+#         save_path      : if given, save trained classifier weights here
+#
+#     Returns:
+#         trained QualityClassifier
+#     """
+#     device = torch.device("cpu")   # classifier is tiny, CPU is fine
+#
+#     X = torch.tensor(hidden_matrix, dtype=torch.float32)
+#     y = torch.tensor(quality_scores, dtype=torch.float32).unsqueeze(-1)
+#
+#     N     = len(X)
+#     n_val = max(1, int(N * val_frac))
+#     idx   = torch.randperm(N)
+#     val_idx   = idx[:n_val]
+#     train_idx = idx[n_val:]
+#
+#     X_train, y_train = X[train_idx], y[train_idx]
+#     X_val,   y_val   = X[val_idx],   y[val_idx]
+#
+#     clf       = QualityClassifier(d_model).to(device)
+#     optimizer = torch.optim.Adam(clf.parameters(), lr=lr)
+#
+#     print(f"\nTraining QualityClassifier: {sum(p.numel() for p in clf.parameters())} params")
+#     print(f"Train: {len(X_train)}  Val: {len(X_val)}")
+#
+#     best_val_loss = float('inf')
+#     best_state    = None
+#
+#     for epoch in range(epochs):
+#         clf.train()
+#         perm       = torch.randperm(len(X_train))
+#         train_loss = 0.0
+#         n_batches  = 0
+#
+#         for start in range(0, len(X_train), batch_size):
+#             batch_idx = perm[start:start + batch_size]
+#             xb, yb    = X_train[batch_idx], y_train[batch_idx]
+#             pred      = clf(xb)
+#             loss      = F.mse_loss(pred, yb)
+#             optimizer.zero_grad()
+#             loss.backward()
+#             optimizer.step()
+#             train_loss += loss.item()
+#             n_batches  += 1
+#
+#         clf.eval()
+#         with torch.no_grad():
+#             val_pred = clf(X_val)
+#             val_loss = F.mse_loss(val_pred, y_val).item()
+#
+#         if epoch % 5 == 0 or epoch == epochs - 1:
+#             print(f"  Ep {epoch+1:3d}  train={train_loss/n_batches:.4f}  val={val_loss:.4f}")
+#
+#         if val_loss < best_val_loss:
+#             best_val_loss = val_loss
+#             best_state    = {k: v.clone() for k, v in clf.state_dict().items()}
+#
+#     if best_state:
+#         clf.load_state_dict(best_state)
+#         print(f"  Best val loss: {best_val_loss:.4f}")
+#
+#     if save_path:
+#         os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+#         torch.save(clf.state_dict(), save_path)
+#         print(f"  Classifier saved: {save_path}")
+#
+#     return clf
+#
+#
+# # ── Guided inference ──────────────────────────────────────────────────
+#
+# def generate_guided(
+#     model,
+#     src:        torch.Tensor,
+#     classifier: QualityClassifier,
+#     guidance_scale: float = 1.0,
+#     temperature:    float = 0.8,
+#     top_k:          int   = 40,
+# ) -> torch.Tensor:
+#     """
+#     Classifier-guided generation.
+#
+#     At each diffusion step:
+#       1. Run forward_cached() → logits, hidden states
+#       2. Compute classifier gradient: ∂(quality_score) / ∂(hidden)
+#       3. Project gradient back to logit space (approximate)
+#       4. guided_logits = logits + λ * gradient_signal
+#       5. Sample from guided_logits
+#
+#     guidance_scale λ:
+#       0.0 → no guidance (standard generation)
+#       0.5 → weak guidance
+#       1.0 → moderate guidance (recommended starting point)
+#       2.0 → strong guidance (may reduce diversity)
+#       3.0 → very strong (may collapse to repetitive output)
+#
+#     Args:
+#         model           : SanskritModel (frozen)
+#         src             : [1, src_len] IAST token ids
+#         classifier      : trained QualityClassifier
+#         guidance_scale  : λ — guidance strength
+#
+#     Returns:
+#         x0_est : [1, tgt_len] generated token ids
+#     """
+#     inner  = model.model
+#     T      = inner.scheduler.num_timesteps
+#     device = next(inner.parameters()).device
+#     clf_device = next(classifier.parameters()).device
+#
+#     if src.dim() == 1:
+#         src = src.unsqueeze(0)
+#     src = src.to(device)
+#
+#     B       = src.shape[0]
+#     tgt_len = inner.max_seq_len
+#     mask_id = inner.mask_token_id
+#
+#     memory, src_pad_mask = inner.encode_source(src)
+#     x0_est  = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+#     hint    = None
+#
+#     inner.eval()
+#     classifier.eval()
+#
+#     for t_val in range(T - 1, -1, -1):
+#         t       = torch.full((B,), t_val, dtype=torch.long, device=device)
+#         is_last = (t_val == 0)
+#
+#         if guidance_scale > 0.0:
+#             # Need gradients for classifier guidance
+#             with torch.enable_grad():
+#                 # Run forward_cached and get hidden states
+#                 PAD = 1
+#                 if t_val > 0:
+#                     _, x_t_ids = inner.forward_process.q_sample(x0_est, t)
+#                 else:
+#                     x_t_ids = x0_est
+#
+#                 x      = inner.tgt_embed(x_t_ids)
+#                 t_norm = t.float() / T
+#                 t_emb  = inner.time_mlp(t_norm.unsqueeze(-1))
+#                 x      = x + t_emb.unsqueeze(1)
+#
+#                 if hint is not None:
+#                     hint_emb = inner.tgt_embed(hint)
+#                     gate     = inner.hint_gate(x)
+#                     x        = x + gate * hint_emb
+#
+#                 for block in inner.decoder_blocks:
+#                     x = block(x, memory, tgt_pad_mask=None, src_pad_mask=src_pad_mask)
+#
+#                 # hidden: [B, tgt_len, d_model] — detach from graph for clf
+#                 hidden = x.detach().requires_grad_(True).to(clf_device)
+#
+#                 # Classifier quality score
+#                 quality = classifier(hidden)   # [B, 1]
+#                 quality.sum().backward()
+#
+#                 # Gradient of quality w.r.t. hidden: [B, tgt_len, d_model]
+#                 grad = hidden.grad.to(device)   # [B, tgt_len, d_model]
+#
+#                 # Project gradient to logit space via output head weight
+#                 # logit_grad ≈ grad @ head.weight   [B, tgt_len, tgt_vocab]
+#                 logit_grad = grad @ inner.head.weight.T
+#
+#                 # Compute standard logits (no gradient needed)
+#                 with torch.no_grad():
+#                     logits = inner.head(x)
+#
+#                 # Apply guidance
+#                 logits = logits + guidance_scale * logit_grad
+#
+#         else:
+#             with torch.no_grad():
+#                 logits, _ = inner.forward_cached(
+#                     memory, src_pad_mask, x0_est, t,
+#                     x0_hint=hint, inference_mode=True,
+#                 )
+#
+#         with torch.no_grad():
+#             logits = logits / max(temperature, 1e-8)
+#             if top_k > 0:
+#                 V = logits.shape[-1]
+#                 if top_k < V:
+#                     vals, _ = torch.topk(logits, top_k, dim=-1)
+#                     logits  = logits.masked_fill(logits < vals[..., -1:], float('-inf'))
+#
+#             probs  = F.softmax(logits, dim=-1)
+#             x0_est = torch.argmax(probs, dim=-1) if is_last else _sample_no_grad(probs)
+#             hint   = x0_est
+#
+#     return x0_est
+#
+#
+# def _sample_no_grad(probs):
+#     B, L, V = probs.shape
+#     flat    = probs.view(B * L, V).clamp(min=1e-9)
+#     flat    = flat / flat.sum(dim=-1, keepdim=True)
+#     return torch.multinomial(flat, 1).squeeze(-1).view(B, L)
+#
+#
+# # ── Guidance scale sweep ──────────────────────────────────────────────
+#
+# def sweep_guidance_scales(
+#     model,
+#     classifier: QualityClassifier,
+#     src_list:   List[torch.Tensor],
+#     ref_list:   List[str],
+#     tgt_tokenizer,
+#     scales:     List[float] = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
+#     n_samples:  int         = 50,
+#     device:     torch.device = None,
+#     output_dir: str          = "analysis/outputs",
+# ) -> Dict:
+#     """
+#     Evaluate CER at each guidance scale.
+#     Produces quality-diversity tradeoff plot.
+#     """
+#     def cer(pred, ref):
+#         if not ref:
+#             return 1.0
+#         def ed(s1, s2):
+#             m, n = len(s1), len(s2)
+#             dp = list(range(n + 1))
+#             for i in range(1, m + 1):
+#                 prev, dp[0] = dp[0], i
+#                 for j in range(1, n + 1):
+#                     temp = dp[j]
+#                     dp[j] = prev if s1[i-1] == s2[j-1] else 1 + min(prev, dp[j], dp[j-1])
+#                     prev = temp
+#             return dp[n]
+#         return ed(pred, ref) / max(len(ref), 1)
+#
+#     device  = device or next(model.parameters()).device
+#     results = {}
+#     n       = min(n_samples, len(src_list))
+#
+#     print("\nGuidance scale sweep...")
+#     for scale in scales:
+#         cer_list   = []
+#         output_set = []
+#         for src, ref in zip(src_list[:n], ref_list[:n]):
+#             if src.dim() == 1:
+#                 src = src.unsqueeze(0)
+#             out      = generate_guided(model, src.to(device), classifier,
+#                                         guidance_scale=scale)
+#             ids      = [x for x in out[0].tolist() if x > 4]
+#             pred     = tgt_tokenizer.decode(ids).strip()
+#             cer_list.append(cer(pred, ref))
+#             output_set.append(pred)
+#
+#         mean_cer = float(np.mean(cer_list))
+#
+#         # Self-diversity: unique outputs / total (proxy for diversity)
+#         unique_frac = len(set(output_set)) / max(len(output_set), 1)
+#
+#         results[scale] = {"mean_cer": mean_cer, "diversity": unique_frac}
+#         print(f"  λ={scale:.1f}  CER={mean_cer:.4f}  diversity={unique_frac:.3f}")
+#
+#     # Plot
+#     os.makedirs(output_dir, exist_ok=True)
+#     try:
+#         import matplotlib.pyplot as plt
+#         fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
+#
+#         sc_list  = sorted(results.keys())
+#         cers     = [results[s]["mean_cer"]   for s in sc_list]
+#         diversities = [results[s]["diversity"] for s in sc_list]
+#
+#         ax1.plot(sc_list, cers, 'o-', color='coral', linewidth=1.8, markersize=7)
+#         ax1.set_xlabel("Guidance scale λ", fontsize=10)
+#         ax1.set_ylabel("CER (↓ better)", fontsize=10)
+#         ax1.set_title("Quality vs guidance scale", fontsize=10)
+#
+#         ax2.plot(sc_list, diversities, 'o-', color='steelblue', linewidth=1.8, markersize=7)
+#         ax2.set_xlabel("Guidance scale λ", fontsize=10)
+#         ax2.set_ylabel("Output diversity (unique fraction)", fontsize=10)
+#         ax2.set_title("Diversity vs guidance scale", fontsize=10)
+#
+#         plt.suptitle("Quality-Diversity Tradeoff (Guidance Scale Sweep)", fontsize=11)
+#         plt.tight_layout()
+#         path = os.path.join(output_dir, "guidance_scale_sweep.png")
+#         plt.savefig(path, dpi=150, bbox_inches='tight')
+#         plt.close()
+#         print(f"  Saved: {path}")
+#     except ImportError:
+#         pass
+#
+#     with open(os.path.join(output_dir, "guidance_results.json"), "w") as f:
+#         json.dump({str(k): v for k, v in results.items()}, f, indent=2)
+#
+#     return results
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from typing import List, Dict
+# ============================================================
+# 1. QUALITY CLASSIFIER
+# ============================================================
+class QualityClassifier(nn.Module):
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(d_model, 128),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(128, 64),
+            nn.ReLU(),
+            nn.Linear(64, 1),
+            nn.Sigmoid(),
+        )
+    def forward(self, hidden):
+        if hidden.dim() == 3:
+            hidden = hidden.mean(dim=1)
+        return self.net(hidden)
+# ============================================================
+# 2. GUIDED GENERATION (CORRECTED)
+# ============================================================
+@torch.no_grad()
+def generate_guided(
+    model,
+    src: torch.Tensor,
+    classifier: QualityClassifier,
+    guidance_scale: float = 1.0,
+    temperature: float = 0.8,
+    top_k: int = 40,
+):
+    inner = model.model
+    T = inner.scheduler.num_timesteps
+    device = next(inner.parameters()).device
+    if src.dim() == 1:
+        src = src.unsqueeze(0)
+    src = src.to(device)
+    B = src.shape[0]
+    tgt_len = inner.max_seq_len
+    mask_id = inner.mask_token_id
+    # KV CACHE
+    memory, src_pad_mask = inner.encode_source(src)
+    x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+    hint = None
+    inner.eval()
+    classifier.eval()
+    for t_val in range(T - 1, -1, -1):
+        t = torch.full((B,), t_val, dtype=torch.long, device=device)
+        is_last = (t_val == 0)
+        if guidance_scale > 0:
+            # ENABLE GRAD FOR GUIDANCE
+            with torch.enable_grad():
+                if t_val > 0:
+                    _, x_t_ids = inner.forward_process.q_sample(x0_est, t)
+                else:
+                    x_t_ids = x0_est
+                x = inner.tgt_embed(x_t_ids)
+                # time embedding
+                t_norm = t.float() / T
+                t_emb = inner.time_mlp(t_norm.unsqueeze(-1))
+                x = x + t_emb.unsqueeze(1)
+                # hint conditioning
+                if hint is not None:
+                    hint_emb = inner.tgt_embed(hint)
+                    gate = inner.hint_gate(x)
+                    x = x + gate * hint_emb
+                # decoder forward
+                for block in inner.decoder_blocks:
+                    x = block(x, memory, tgt_pad_mask=None, src_pad_mask=src_pad_mask)
+                # IMPORTANT: NO DETACH HERE
+                hidden = x.requires_grad_(True)
+                # classifier forward
+                quality = classifier(hidden)  # [B,1]
+                # compute gradient
+                quality.sum().backward()
+                grad = hidden.grad  # [B, L, d_model]
+                # ===== FIX 1: Normalize gradient =====
+                grad_norm = grad.norm(dim=-1, keepdim=True) + 1e-6
+                grad = grad / grad_norm
+                # ===== FIX 2: Project to logit space =====
+                logit_grad = torch.matmul(grad, inner.head.weight.T)
+                # ===== FIX 3: Clip gradient =====
+                logit_grad = torch.clamp(logit_grad, -5.0, 5.0)
+                # compute logits (no grad)
+                with torch.no_grad():
+                    logits = inner.head(x)
+                # apply guidance
+                logits = logits + guidance_scale * logit_grad
+        else:
+            with torch.no_grad():
+                logits, _ = inner.forward_cached(
+                    memory, src_pad_mask, x0_est, t,
+                    x0_hint=hint,
+                    inference_mode=True,
+                )
+        # ===== Sampling =====
+        logits = logits / max(temperature, 1e-8)
+        if top_k > 0:
+            V = logits.shape[-1]
+            if top_k < V:
+                vals, _ = torch.topk(logits, top_k, dim=-1)
+                logits = logits.masked_fill(logits < vals[..., -1:], float('-inf'))
+        probs = F.softmax(logits, dim=-1)
+        if is_last:
+            x0_est = torch.argmax(probs, dim=-1)
+        else:
+            x0_est = _sample(probs)
+        hint = x0_est
+    return x0_est
+def _sample(probs):
+    B, L, V = probs.shape
+    flat = probs.view(B * L, V).clamp(min=1e-9)
+    flat = flat / flat.sum(dim=-1, keepdim=True)
+    return torch.multinomial(flat, 1).squeeze(-1).view(B, L)
+# ============================================================
+# 3. GUIDANCE SWEEP (EVALUATION)
+# ============================================================
+def sweep_guidance(
+    model,
+    classifier,
+    src_list,
+    ref_list,
+    tgt_tokenizer,
+    scales=[0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
+    n_samples=50,
+):
+    def cer(pred, ref):
+        if not ref:
+            return 1.0
+        dp = list(range(len(ref) + 1))
+        for i in range(1, len(pred) + 1):
+            prev, dp[0] = dp[0], i
+            for j in range(1, len(ref) + 1):
+                temp = dp[j]
+                dp[j] = prev if pred[i-1] == ref[j-1] else 1 + min(prev, dp[j], dp[j-1])
+                prev = temp
+        return dp[-1] / max(len(ref), 1)
+    results = {}
+    for scale in scales:
+        cer_list = []
+        outputs = []
+        for src, ref in zip(src_list[:n_samples], ref_list[:n_samples]):
+            if src.dim() == 1:
+                src = src.unsqueeze(0)
+            out = generate_guided(model, src, classifier, scale)
+            ids = [x for x in out[0].tolist() if x > 4]
+            pred = tgt_tokenizer.decode(ids).strip()
+            cer_list.append(cer(pred, ref))
+            outputs.append(pred)
+        results[scale] = {
+            "CER": float(np.mean(cer_list)),
+            "diversity": len(set(outputs)) / len(outputs)
+        }
+        print(f"λ={scale:.1f} | CER={results[scale]['CER']:.4f} | diversity={results[scale]['diversity']:.3f}")
+    return results

analysis/reports/README.md ADDED Viewed

	@@ -0,0 +1,19 @@

+# Analysis Reports
+This folder contains mentor-facing writeups for the five analysis tasks:
+- [Task 1](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/reports/task1_kv_cache_report.md)
+- [Task 2](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/reports/task2_attention_drift_report.md)
+- [Task 3](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/reports/task3_concept_vectors_report.md)
+- [Task 4](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/reports/task4_step_ablation_report.md)
+- [Task 5](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/reports/task5_quality_guidance_report.md)
+These reports are written for evaluation use. They include:
+- objective
+- implementation summary
+- code snippet
+- result status
+- benefits
+- limitations
+- conclusion

analysis/reports/task1_kv_cache_report.md ADDED Viewed

	@@ -0,0 +1,99 @@

+# Task 1 Report: KV Cache Benchmark
+## 1. Objective
+The purpose of Task 1 is to measure whether encoder-side key/value caching improves inference speed for the cross-attention D3PM paraphrase model. In the unoptimized version, the source sequence is re-encoded at every diffusion step. In the cached version, the source is encoded once and reused for all denoising steps.
+This task is useful for mentor evaluation because it measures an engineering improvement directly tied to deployment cost. Even when model quality is unchanged, lower generation latency improves usability for experimentation, batch evaluation, and interactive inference.
+## 2. Implementation Approach
+The benchmark is implemented in [analysis/kv_cache_benchmark.py](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/kv_cache_benchmark.py). To support it, the cross-attention model was extended with three helper methods in [model/d3pm_model_cross_attention.py](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/model/d3pm_model_cross_attention.py):
+- `encode_source(...)`
+- `forward_cached(...)`
+- `generate_cached(...)`
+These methods separate source encoding from decoder-side denoising, which is the standard way to benchmark KV caching in encoder-decoder style architectures.
+### Core Implementation Snippet
+```python
+def encode_source(self, src):
+    PAD = 1
+    src_pad_mask = (src == PAD)
+    memory = self.src_embed(src)
+    for block in self.encoder_blocks:
+        memory = block(memory, pad_mask=src_pad_mask)
+    return memory, src_pad_mask
+def forward_cached(self, memory, src_pad_mask, tgt, t, x0_hint=None, inference_mode=False):
+    ...
+    for block in self.decoder_blocks:
+        x = block(x, memory, tgt_pad_mask=tgt_pad_mask, src_pad_mask=src_pad_mask)
+    self._last_hidden = x.detach()
+    return self.head(x), None
+```
+This design avoids recomputing the encoder stack at each diffusion step.
+## 3. Experimental Setup
+The benchmark was run using the Task 1 entry point:
+```bash
+uv run --active analysis/run_analysis.py --task 1
+```
+The script tests source lengths of 16, 32, and 64 tokens and reports:
+- standard generation time
+- cached generation time
+- speedup ratio
+- estimated encoder cost as a percentage of one forward pass
+The benchmark output is stored in [analysis/outputs/task1_kv_cache.txt](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task1_kv_cache.txt).
+## 4. Results
+Observed benchmark values:
+| Source Length | Standard (s) | Cached (s) | Speedup | Encoder % |
+| --- | ---: | ---: | ---: | ---: |
+| 16 | 1.784 | 1.780 | 1.00x | 42.7% |
+| 32 | 2.055 | 1.850 | 1.11x | 41.9% |
+| 64 | 1.724 | 1.608 | 1.07x | 43.2% |
+The main outcome is that caching works correctly and provides a measurable speed improvement, though the improvement is modest on the current hardware and runtime stack.
+## 5. Interpretation
+The result is technically correct and useful, but it should be positioned carefully in evaluation:
+- This is a systems optimization result, not a model quality result.
+- The speedup is real, but not dramatic.
+- The benchmark confirms that source-side recomputation can be removed without changing the inference algorithm.
+For mentor evaluation, this can be presented as a successful engineering optimization with limited but positive runtime impact.
+## 6. Benefits
+Benefits of this task:
+- reduces redundant encoder computation
+- provides a reusable cached inference path for later analysis tasks
+- improves scalability for repeated generation and diagnostic probes
+- establishes infrastructure for attention and hidden-state inspection
+## 7. Limitations
+The result should not be overstated:
+- speedup depends heavily on hardware and backend
+- current gains are relatively small
+- more stable benchmarking would require repeated runs and device-specific profiling
+- this does not improve semantic accuracy directly
+## 8. Conclusion
+Task 1 is valid and suitable for mentor evaluation as an implementation-focused result. It demonstrates that cached inference was successfully added to the D3PM cross-attention model and that it reduces generation cost modestly. The strongest value of this task is architectural: it enables faster repeated inference and supports later interpretability experiments.

analysis/reports/task2_attention_drift_report.md ADDED Viewed

	@@ -0,0 +1,112 @@

+# Task 2 Report: Attention Visualization and Semantic Drift
+## 1. Objective
+Task 2 investigates how the diffusion model behaves internally during generation. It has two goals:
+- capture cross-attention patterns between source and generated target tokens
+- measure how intermediate generations converge toward the final output over diffusion steps
+This task is important for evaluation because it gives interpretability evidence. Instead of only showing the final prediction, it examines whether the model gradually stabilizes its output and whether attention is distributed in a meaningful way.
+## 2. Implementation Approach
+The implementation uses two analysis modules:
+- [analysis/attention_viz.py](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/attention_viz.py)
+- [analysis/semantic_drift.py](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/semantic_drift.py)
+To support this, the cross-attention layer stores attention weights during decoding. The model also exposes a cached inference path so per-step diagnostics can be collected efficiently.
+### Attention Capture Snippet
+```python
+class MultiHeadAttention(nn.Module):
+    def __init__(self, d_model, n_heads, dropout=0.1):
+        ...
+        self.capture_weights = False
+        self.last_attn_weights = None
+    def forward(self, q, k, v, mask=None):
+        ...
+        attn = self.dropout(torch.softmax(scores, dim=-1))
+        if self.capture_weights:
+            self.last_attn_weights = attn.detach().cpu()
+```
+### Drift Computation Snippet
+```python
+def compute_drift(step_outputs, final_output):
+    t_vals = sorted(step_outputs.keys(), reverse=True)
+    cer_to_final = []
+    for t_val in t_vals:
+        cer = compute_cer_between(step_outputs[t_val], final_output)
+        cer_to_final.append(cer)
+```
+The metric used is character error rate between each intermediate output and the final output.
+## 3. Experimental Setup
+The task was run with:
+```bash
+uv run --active analysis/run_analysis.py --task 2 --input "dharmo rakṣati rakṣitaḥ"
+```
+Generated outputs:
+- [analysis/outputs/task2_attn_t127.png](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task2_attn_t127.png)
+- [analysis/outputs/task2_attn_t0.png](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task2_attn_t0.png)
+- [analysis/outputs/task2_all_layers_t0.png](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task2_all_layers_t0.png)
+- [analysis/outputs/task2_attn_evolution.png](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task2_attn_evolution.png)
+- [analysis/outputs/task2_semantic_drift.png](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task2_semantic_drift.png)
+- [analysis/outputs/task2_report.txt](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task2_report.txt)
+## 4. Results
+The saved report shows:
+- lock-in timestep: `t = 22`
+- mean token-position lock-in: `53.6 ± 28.4`
+This indicates that the generated sequence becomes relatively stable before the final denoising step. In other words, the model is not making all of its decisions only at the very end.
+However, the actual generated Sanskrit output is low quality and strongly repetitive. That matters for interpretation: the drift curve is still valid as a measure of convergence, but it is convergence toward a weak final output.
+## 5. Interpretation
+For mentor evaluation, this task should be presented as a diagnostic analysis rather than a quality claim.
+What the task supports:
+- the model’s output evolves gradually over time
+- the diffusion process shows an identifiable stabilization region
+- attention weights can now be inspected layer by layer
+What the task does not yet support:
+- strong semantic alignment
+- trustworthy linguistic paraphrase quality
+- meaningful claim that attention maps correspond to correct Sanskrit transformation
+## 6. Benefits
+This task has practical value even with imperfect outputs:
+- helps identify when the model stabilizes
+- supports debugging of the denoising trajectory
+- provides visual artifacts for discussing model internals
+- can guide reduction of unnecessary inference steps in future work
+## 7. Limitations
+There are two important limitations:
+1. The output quality is weak, so the interpretability evidence is about model behavior, not model correctness.
+2. Matplotlib on the current machine does not render Devanagari fonts well, so the generated figures contain font warnings and may not display labels cleanly.
+## 8. Conclusion
+Task 2 is partially suitable for evaluation. It is strong as an interpretability and debugging report, but weak as proof of semantic paraphrase quality. For mentor review, it should be framed as evidence that the diffusion generation process can now be inspected and analyzed step by step.

analysis/reports/task3_concept_vectors_report.md ADDED Viewed

	@@ -0,0 +1,96 @@

+# Task 3 Report: Concept Vectors and PCA-Based Steering
+## 1. Objective
+Task 3 explores whether decoder hidden states contain a measurable direction corresponding to paraphrase diversity. The idea is:
+1. collect hidden states from many validation samples
+2. fit PCA to the hidden-state space
+3. find a principal direction correlated with output diversity
+4. steer generation along that direction
+This is an advanced representation-learning experiment. Its value for mentor evaluation lies in showing that the project is not limited to training and inference, but also investigates controllable generation.
+## 2. Implementation Approach
+The implementation is in [analysis/concept_vectors.py](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/concept_vectors.py). Hidden states are captured from the decoder during cached inference and pooled across sequence positions.
+### PCA Fitting Snippet
+```python
+def fit_pca(hidden_matrix, n_components=50):
+    from sklearn.decomposition import PCA
+    n_comp = min(n_components, hidden_matrix.shape[0] - 1, hidden_matrix.shape[1])
+    pca = PCA(n_components=n_comp)
+    pca.fit(hidden_matrix)
+    return pca
+```
+### Steering Snippet
+```python
+if alpha != 0.0:
+    x = x + alpha * dir_tensor.unsqueeze(0).unsqueeze(0)
+logits = inner.head(x)
+```
+The steering mechanism adds a learned direction in hidden-state space before projection to logits.
+## 3. Experimental Setup
+Task 3 was run from the shared analysis driver and generated:
+- [analysis/outputs/task3_concept_space.png](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task3_concept_space.png)
+- [analysis/outputs/task3_diversity_direction.npy](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task3_diversity_direction.npy)
+- [analysis/outputs/task3_report.txt](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task3_report.txt)
+The run used 500 validation examples for hidden-state extraction.
+## 4. Results
+Observed summary:
+- PCA components retained: `50`
+- total explained variance: `96.1%`
+- selected diversity principal component: `PC 1`
+- absolute correlation with output length: `0.303`
+On paper, these values suggest that hidden-state variation is structured and that at least one direction correlates with output-length changes. That is a positive sign from a representation-analysis standpoint.
+However, the actual diversity spectrum outputs are not semantically convincing. The steered generations are highly repetitive and mostly malformed token sequences rather than clear paraphrases with controlled variation.
+## 5. Interpretation
+This task should be presented carefully.
+What is supported:
+- hidden states are rich enough for PCA analysis
+- the representation space is not random noise
+- controllable steering infrastructure has been implemented successfully
+What is not yet supported:
+- interpretable semantic control
+- high-quality paraphrase diversity
+- evidence that the identified direction reflects useful linguistic variation
+For mentor evaluation, this is best framed as a promising exploratory experiment rather than a finished result.
+## 6. Benefits
+Benefits of the task include:
+- opens a path toward controllable paraphrase generation
+- demonstrates hidden-state instrumentation beyond standard inference
+- provides a research direction for future work on style and diversity control
+- connects model analysis with possible user-facing controllability
+## 7. Limitations
+The main limitation is output quality. Even though the PCA statistics look reasonable, the steered generations are not linguistically strong enough to claim meaningful semantic control. This makes the current result more useful as a prototype than as a validated research finding.
+## 8. Conclusion
+Task 3 is not yet strong enough as a final evaluation result, but it is valuable as research evidence of advanced model analysis. For mentor discussion, it should be described as an experimental controllability framework that has been implemented successfully but still requires better base model quality before the steering outputs become persuasive.

analysis/reports/task4_step_ablation_report.md ADDED Viewed

	@@ -0,0 +1,89 @@

+# Task 4 Report: Diffusion Step Ablation
+## 1. Objective
+Task 4 studies how the number of diffusion steps affects meaning preservation, speed, and robustness. The hypothesis is that fewer denoising steps may improve speed, but too few steps may reduce output quality. This type of ablation is important for mentor evaluation because it tests a core design parameter of the D3PM model.
+Unlike the earlier tasks, this one requires retraining separate checkpoints for each step count. This is not optional. A model trained at `T=128` cannot be evaluated fairly at `T=4` or `T=8` without retraining, because the timestep distribution seen during training changes fundamentally.
+## 2. Implementation Approach
+The implementation is in [analysis/step_ablation.py](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/step_ablation.py). I patched the workflow so it is safe for this repository:
+- it no longer overwrites `config.py`
+- it uses environment variables for `DIFFUSION_STEPS`
+- each training run writes directly to `ablation_results/T*`
+### Training Script Generation Snippet
+```python
+f.write(
+    f"MODEL_TYPE=\"$MODEL_TYPE\" INCLUDE_NEG=\"$INCLUDE_NEG\" "
+    f"TRAIN_DEVICE=\"$TRAIN_DEVICE\" "
+    f"DIFFUSION_STEPS={T} INFERENCE_NUM_STEPS={T} "
+    f"TRAIN_OUTPUT_DIR=\"ablation_results/T{T}\" "
+    f"python train.py\n\n"
+)
+```
+This makes the ablation workflow reproducible without mutating repository files between runs.
+## 3. Current Workflow
+Task 4 now supports the following sequence:
+```bash
+uv run --active analysis/run_analysis.py --task 4 --phase generate_configs
+bash ablation_configs/train_all.sh
+uv run --active analysis/run_analysis.py --task 4 --phase analyze
+```
+Generated script:
+- [ablation_configs/train_all.sh](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/ablation_configs/train_all.sh)
+This script trains:
+- `T=4`
+- `T=8`
+- `T=16`
+- `T=32`
+- `T=64`
+with outputs saved to `ablation_results/T4`, `T8`, `T16`, `T32`, and `T64`.
+## 4. Current Result Status
+At the moment, no trained ablation checkpoints exist in `ablation_results/T*/best_model.pt`. Therefore, the analysis phase has no quantitative result yet. That means Task 4 currently has a correct implementation pipeline, but not a completed experiment.
+This distinction matters for evaluation:
+- the workflow is correct
+- the experiment has not yet produced final numbers
+## 5. Evaluation Value
+For mentor evaluation, Task 4 can still be included, but it should be presented as:
+- a completed experimental setup
+- a validated retraining workflow
+- pending final quantitative results
+This is still useful because ablation design is part of research rigor. It shows that the project is set up to test the effect of a critical modeling choice instead of assuming the default step count is optimal.
+## 6. Benefits
+Once the checkpoints are trained, this task will answer:
+- how much generation speed improves as diffusion steps decrease
+- how meaning preservation changes with fewer steps
+- where the best quality-speed tradeoff lies
+- whether the current choice of diffusion steps is over- or under-provisioned
+## 7. Limitations
+The limitation is straightforward: there are no ablation checkpoints yet, so there are no real results to defend. It should not be presented as a finished evaluation experiment at this stage.
+## 8. Conclusion
+Task 4 is structurally correct and now safe to run in this repository. It is suitable for mentor evaluation as an experimental design and workflow contribution, but not yet as a result section. The next milestone is to train the five ablation checkpoints and run the analysis phase to generate the actual CER-speed comparison.

analysis/reports/task5_quality_guidance_report.md ADDED Viewed

	@@ -0,0 +1,101 @@

+# Task 5 Report: Quality Classifier and Guidance-Based Decoding
+## 1. Objective
+Task 5 attempts to guide generation using a lightweight quality classifier trained on decoder hidden states. The idea is to predict a quality score from hidden states and then use the classifier gradient to bias inference toward higher-quality outputs.
+This is an ambitious extension because it adds a second learned component on top of the main D3PM model without retraining the core paraphrase model itself.
+## 2. Implementation Approach
+The implementation is in [analysis/quality_classifier.py](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/quality_classifier.py). It has three stages:
+1. collect `(hidden_state, quality_score)` pairs
+2. train a small MLP quality classifier
+3. use classifier gradients during decoding
+### Classifier Definition Snippet
+```python
+class QualityClassifier(nn.Module):
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(d_model, 128),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(128, 64),
+            nn.ReLU(),
+            nn.Linear(64, 1),
+            nn.Sigmoid(),
+        )
+```
+### Guidance Snippet
+```python
+hidden = x.detach().to(clf_device).requires_grad_(True)
+hidden.retain_grad()
+quality = classifier(hidden)
+quality.sum().backward()
+grad = hidden.grad.to(device)
+logit_grad = grad @ inner.head.weight.T
+logits = logits + guidance_scale * logit_grad
+```
+This turns hidden-state quality prediction into a differentiable decoding signal.
+## 3. Current Status
+Task 5 originally failed for two reasons:
+- the gradient was taken from a non-leaf tensor, causing `hidden.grad` to be `None`
+- the cached quality labels collapsed to all zeros, so the classifier had no meaningful learning signal
+These implementation bugs were patched. However, the existing saved quality cache in [analysis/outputs/task5_quality_data.npz](/Users/bhsingh/Documents/Final_Paraphrase/Exclude_Negative/analysis/outputs/task5_quality_data.npz) still contains degenerate labels from the earlier failed run.
+Observed cache statistics:
+- count: `500`
+- mean: `0.0`
+- std: `0.0`
+- min: `0.0`
+- max: `0.0`
+That means the current classifier result is not valid for evaluation.
+## 4. Why the Current Result Is Not Reliable
+Because all quality labels are zero:
+- the classifier is effectively trained on a constant target
+- low validation loss is meaningless
+- guidance behavior cannot be interpreted as quality-aware control
+So although the code path now exists, the saved run should not be used in mentor evaluation as a finished result.
+## 5. What Was Fixed
+Two concrete corrections were made:
+- a bounded quality transform was introduced so very large CER values do not collapse everything to zero
+- the Task 5 runner now refreshes cached quality data when it detects degenerate labels
+This means Task 5 is closer to being experimentally sound, but it still needs to be rerun from scratch after the patch.
+## 6. Expected Benefits
+If Task 5 works as intended after rerunning, it could provide:
+- a lightweight mechanism for improving generation quality
+- a controllable quality-diversity tradeoff
+- a reusable framework for guidance without retraining the full D3PM model
+- a more research-oriented extension beyond standard training and inference
+## 7. Limitations
+At present, this task has one decisive limitation: the saved outputs are not valid evaluation artifacts. The infrastructure is promising, but the experimental evidence is not yet strong enough to defend.
+## 8. Conclusion
+Task 5 should be presented only as a partially completed advanced experiment. The implementation framework is now in place and the core bugs have been addressed, but the current cached run is still invalid for evaluation. Before showing this task to a mentor as a result, the quality data and guidance sweep should be rerun after patching so that the classifier is trained on non-degenerate labels.

analysis/run_analysis.py ADDED Viewed

	@@ -0,0 +1,466 @@

+"""
+analysis/run_analysis.py
+=========================
+Entry point for all 5 tasks.
+Tasks:
+  Task 1 — KV Cache benchmark          (no retraining)
+  Task 2 — Attention viz + drift        (no retraining)
+  Task 3 — Concept vectors + PCA steer  (no retraining)
+  Task 4 — Step ablation                (REQUIRES retraining for each T)
+  Task 5 — Classifier-free guidance     (trains small 10k-param classifier)
+Usage:
+  python analysis/run_analysis.py --task 1
+  python analysis/run_analysis.py --task 2 --input "dharmo rakṣati rakṣitaḥ"
+  python analysis/run_analysis.py --task 3
+  python analysis/run_analysis.py --task 4 --phase generate_configs
+  python analysis/run_analysis.py --task 4 --phase analyze
+  python analysis/run_analysis.py --task 5
+  python analysis/run_analysis.py --task all --input "satyameva jayate"
+Output files: analysis/outputs/
+"""
+import copy
+import torch
+import os, sys, argparse, json
+import numpy as np
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from config import CONFIG
+from inference import load_model
+from model.tokenizer import SanskritSourceTokenizer, SanskritTargetTokenizer
+OUTPUT_DIR = "analysis/outputs"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+# ── Shared loader ─────────────────────────────────────────────────────
+def infer_model_type_from_checkpoint(ckpt_path: str) -> str:
+    name = ckpt_path.lower()
+    if "ablation_results/t" in name or "d3pm_cross_attention" in name:
+        return "d3pm_cross_attention"
+    if "d3pm_encoder_decoder" in name:
+        return "d3pm_encoder_decoder"
+    if "baseline_cross_attention" in name:
+        return "baseline_cross_attention"
+    if "baseline_encoder_decoder" in name:
+        return "baseline_encoder_decoder"
+    return CONFIG["model_type"]
+def infer_include_negative_from_checkpoint(ckpt_path: str) -> bool:
+    name = ckpt_path.lower()
+    if "_neg_true" in name:
+        return True
+    if "_neg_false" in name:
+        return False
+    if "ablation_results/t" in name:
+        return False
+    return CONFIG["data"]["include_negative_examples"]
+def load_everything(cfg, device, ckpt_override=None):
+    model_name = cfg['model_type']
+    has_neg    = cfg['data']['include_negative_examples']
+    candidates = [
+        f"results7/{model_name}_neg_{has_neg}/best_model.pt",
+        f"results/{model_name}_neg_{has_neg}/best_model.pt",
+        f"results7/{model_name}_neg_True/best_model.pt",
+        f"results/{model_name}_neg_True/best_model.pt",
+        f"results7/{model_name}_neg_False/best_model.pt",
+        f"results/{model_name}_neg_False/best_model.pt",
+        "ablation_results/T4/best_model.pt",
+        "ablation_results/T8/best_model.pt",
+    ]
+    ckpt = ckpt_override if ckpt_override else next((p for p in candidates if os.path.exists(p)), None)
+    if not os.path.exists(ckpt):
+        raise FileNotFoundError(f"No checkpoint found. Checked: {candidates}")
+    model, cfg = load_model(ckpt, cfg, device)
+    model.eval()
+    src_tok = SanskritSourceTokenizer(
+        vocab_size=cfg['model'].get('src_vocab_size', 500),
+        max_len=cfg['model']['max_seq_len'])
+    tgt_tok = SanskritTargetTokenizer(
+        vocab_size=cfg['model'].get('tgt_vocab_size', 500),
+        max_len=cfg['model']['max_seq_len'])
+    return model, src_tok, tgt_tok, cfg
+def load_val_data(cfg, src_tok, tgt_tok, n=500):
+    """Load validation set as (src_tensors, ref_strings, input_strings)."""
+    from data.dataset import OptimizedSanskritDataset
+    from torch.utils.data import Subset
+    from sklearn.model_selection import train_test_split
+    dataset = OptimizedSanskritDataset(
+        'train', max_len=cfg['model']['max_seq_len'],
+        cfg=cfg, src_tokenizer=src_tok, tgt_tokenizer=tgt_tok)
+    total = min(cfg['data']['dataset_size'], len(dataset))
+    _, val_idx = train_test_split(list(range(total)), train_size=0.8, random_state=42)
+    val_idx = val_idx[:n]
+    src_list, ref_list, inp_list = [], [], []
+    for i in val_idx:
+        item = dataset[i]
+        src_list.append(item['input_ids'].unsqueeze(0))
+        ref_list.append(item['target_text'])
+        inp_list.append(item['input_text'])
+    return src_list, ref_list, inp_list
+# ── Task 1 ────────────────────────────────────────────────────────────
+def run_task1(model, src_tok, device):
+    print("\n" + "="*65)
+    print("  TASK 1 — KV Cache Benchmark")
+    print("="*65)
+    if not hasattr(model.model, 'generate_cached'):
+        print("  SKIP: not D3PMCrossAttention.")
+        return
+    from analysis.kv_cache_benchmark import run_benchmark, print_summary
+    results = run_benchmark(model, src_tok, device, src_lens=[16, 32, 64])
+    print_summary(results)
+    path = os.path.join(OUTPUT_DIR, "task1_kv_cache.txt")
+    with open(path, "w") as f:
+        f.write("TASK 1 — KV CACHE BENCHMARK\n" + "="*40 + "\n\n")
+        f.write(f"{'src_len':>8}  {'standard(s)':>12}  {'cached(s)':>10}  "
+                f"{'speedup':>8}  {'encoder%':>9}\n")
+        for src_len, r in results.items():
+            f.write(f"{src_len:>8}  {r['standard_s']:>12.3f}  {r['cached_s']:>10.3f}  "
+                    f"{r['speedup']:>7.2f}x  {r['encoder_pct']:>8.1f}%\n")
+    print(f"  Saved: {path}")
+# ── Task 2 ────────────────────────────────────────────────────────────
+def run_task2(model, src_tok, tgt_tok, device, input_text):
+    print("\n" + "="*65)
+    print("  TASK 2 — Attention Visualization + Semantic Drift")
+    print("="*65)
+    print(f"  Input: {input_text}")
+    if not hasattr(model.model, 'encode_source'):
+        print("  SKIP: not D3PMCrossAttention.")
+        return
+    src_ids    = src_tok.encode(input_text)
+    src_tensor = torch.tensor([src_ids], dtype=torch.long, device=device)
+    src_chars  = list(input_text.strip())
+    from analysis.attention_viz import (AttentionCapture, plot_attn_heatmap,
+                                         plot_attn_evolution, plot_all_layers)
+    from analysis.semantic_drift import (capture_intermediate_outputs,
+                                          compute_drift, compute_token_stability,
+                                          plot_drift_curve)
+    # Attention capture
+    print("  Capturing attention weights...")
+    capturer     = AttentionCapture(model)
+    step_weights = capturer.capture(src_tensor, capture_every=10)
+    with torch.no_grad():
+        out_ids  = model.generate_cached(src_tensor)
+    tgt_ids   = [x for x in out_ids[0].tolist() if x > 4]
+    tgt_text  = tgt_tok.decode(tgt_ids).strip()
+    tgt_chars = list(tgt_text)
+    print(f"  Output: {tgt_text}")
+    first_t = max(step_weights.keys())
+    plot_attn_heatmap(step_weights, t_val=first_t, layer=0,
+        src_tokens=src_chars[:20], tgt_tokens=tgt_chars[:20],
+        save_path=os.path.join(OUTPUT_DIR, f"task2_attn_t{first_t}.png"),
+        title=f"Attention t={first_t} (noisy)  Layer 0")
+    plot_attn_heatmap(step_weights, t_val=0, layer=0,
+        src_tokens=src_chars[:20], tgt_tokens=tgt_chars[:20],
+        save_path=os.path.join(OUTPUT_DIR, "task2_attn_t0.png"),
+        title="Attention t=0 (final)  Layer 0")
+    plot_all_layers(step_weights, t_val=0,
+        src_tokens=src_chars[:20], tgt_tokens=tgt_chars[:20],
+        save_path=os.path.join(OUTPUT_DIR, "task2_all_layers_t0.png"))
+    if len(src_chars) > 0 and len(tgt_chars) > 0:
+        plot_attn_evolution(step_weights, src_token_idx=0, tgt_token_idx=0,
+            layer=0, src_token_str=src_chars[0], tgt_token_str=tgt_chars[0],
+            save_path=os.path.join(OUTPUT_DIR, "task2_attn_evolution.png"))
+    # Semantic drift
+    print("  Computing semantic drift...")
+    step_outputs, final_out = capture_intermediate_outputs(
+        model, src_tensor, tgt_tok, capture_every=5)
+    drift   = compute_drift(step_outputs, final_out)
+    stab    = compute_token_stability(step_outputs, final_out, tgt_tok)
+    plot_drift_curve(drift, src_text=input_text,
+        save_path=os.path.join(OUTPUT_DIR, "task2_semantic_drift.png"))
+    print(f"  Lock-in timestep: t={drift['lock_in_t']}")
+    print(f"  Mean position lock-in: t={stab['mean_lock_t']:.1f} ± {stab['std_lock_t']:.1f}")
+    report = os.path.join(OUTPUT_DIR, "task2_report.txt")
+    with open(report, "w", encoding="utf-8") as f:
+        f.write("TASK 2 — ATTENTION + DRIFT REPORT\n" + "="*50 + "\n\n")
+        f.write(f"Input  : {input_text}\nOutput : {final_out}\n\n")
+        f.write(f"Lock-in t : {drift['lock_in_t']}\n")
+        f.write(f"Mean pos lock-in : {stab['mean_lock_t']:.1f} ± {stab['std_lock_t']:.1f}\n\n")
+        f.write("Step → Output → CER-to-final\n" + "-"*60 + "\n")
+        for tv, cer in zip(drift["t_vals"], drift["cer_to_final"]):
+            f.write(f"  t={tv:4d}  |  {step_outputs.get(tv,'')[:40]:40s}  |  {cer:.4f}\n")
+    print(f"  Report: {report}")
+# ── Task 3 ────────────────────────────────────────────────────────────
+def run_task3(model, src_tok, tgt_tok, device, src_list, ref_list):
+    print("\n" + "="*65)
+    print("  TASK 3 — Concept Vectors + PCA Steering")
+    print("="*65)
+    if not hasattr(model.model, 'encode_source'):
+        print("  SKIP: not D3PMCrossAttention.")
+        return
+    from analysis.concept_vectors import (collect_hidden_states, fit_pca,
+        find_diversity_direction, generate_diversity_spectrum, plot_pca_space)
+    # Collect hidden states from val set
+    n = min(500, len(src_list))
+    print(f"  Collecting hidden states from {n} examples...")
+    hidden, _ = collect_hidden_states(
+        model, src_list[:n], t_capture=0, max_samples=n)
+    # Compute output lengths for diversity direction
+    lengths = []
+    for src in src_list[:n]:
+        with torch.no_grad():
+            out = model.generate_cached(src.to(device))
+        ids = [x for x in out[0].tolist() if x > 4]
+        lengths.append(len(tgt_tok.decode(ids)))
+    # Fit PCA + find diversity direction
+    pca = fit_pca(hidden, n_components=min(50, n-1))
+    direction, best_pc, corr = find_diversity_direction(hidden, lengths, pca)
+    # Plot concept space
+    plot_pca_space(hidden, lengths, pca, best_pc,
+        save_path=os.path.join(OUTPUT_DIR, "task3_concept_space.png"))
+    # Generate diversity spectrum for first example
+    print("\n  Diversity spectrum for first example:")
+    src0  = src_list[0]
+    inp0  = src_tok.decode([x for x in src0[0].tolist() if x > 4])
+    print(f"  Input: {inp0}")
+    spectrum = generate_diversity_spectrum(
+        model, src0.to(device), direction, tgt_tok,
+        alphas=[-2.0, -1.0, 0.0, 1.0, 2.0])
+    # Save diversity direction + results
+    np.save(os.path.join(OUTPUT_DIR, "task3_diversity_direction.npy"), direction)
+    report = os.path.join(OUTPUT_DIR, "task3_report.txt")
+    with open(report, "w", encoding="utf-8") as f:
+        f.write("TASK 3 — CONCEPT VECTORS + PCA STEERING\n" + "="*50 + "\n\n")
+        f.write(f"PCA: {pca.n_components_} components, "
+                f"{pca.explained_variance_ratio_.sum()*100:.1f}% variance\n")
+        f.write(f"Diversity PC: {best_pc}  (|r|={corr:.3f} with output length)\n\n")
+        f.write("Diversity spectrum:\n")
+        for alpha, text in sorted(spectrum.items()):
+            f.write(f"  alpha={alpha:+.1f}  →  {text}\n")
+    print(f"  Report: {report}")
+# ── Task 4 ────────────────────────────────────────────────────────────
+def run_task4(phase, model, src_tok, tgt_tok, device, cfg,
+              src_list, ref_list):
+    print("\n" + "="*65)
+    print(f"  TASK 4 — Step Ablation  (phase={phase})")
+    print("="*65)
+    from analysis.step_ablation import (generate_ablation_configs,
+        run_ablation_analysis, plot_ablation_3d, run_adversarial_test)
+    if phase == "generate_configs":
+        print("  Generating ablation configs...")
+        generate_ablation_configs(output_dir="ablation_configs")
+        print("\n  NEXT STEPS:")
+        print("  1. bash ablation_configs/train_all.sh")
+        print("  2. python analysis/run_analysis.py --task 4 --phase analyze")
+    elif phase == "analyze":
+        # Check which models exist
+        existing = [T for T in [4, 8, 16, 32, 64]
+                    if os.path.exists(f"ablation_results/T{T}/best_model.pt")]
+        if not existing:
+            print("  No ablation models found at ablation_results/T*/best_model.pt")
+            print("  Run: python analysis/run_analysis.py --task 4 --phase generate_configs")
+            print("  Then: bash ablation_configs/train_all.sh")
+            return
+        print(f"  Found models for T={existing}")
+        results = run_ablation_analysis(
+            ablation_dir="ablation_results", base_cfg=cfg,
+            src_list=src_list[:200], ref_list=ref_list[:200],
+            tgt_tokenizer=tgt_tok, device=device,
+            output_dir=OUTPUT_DIR)
+        plot_ablation_3d(results,
+            save_path=os.path.join(OUTPUT_DIR, "task4_ablation_3d.png"))
+    # Adversarial robustness always runs on existing model (no retraining)
+    print("\n  Running adversarial robustness test...")
+    inp_texts = [src_tok.decode([x for x in s[0].tolist() if x > 4])
+                 for s in src_list[:50]]
+    run_adversarial_test(
+        model, src_tok, tgt_tok,
+        test_inputs=inp_texts, test_refs=ref_list[:50],
+        device=device, output_dir=OUTPUT_DIR)
+# ── Task 5 ────────────────────────────────────────────────────────────
+def run_task5(model, src_tok, tgt_tok, device, cfg, src_list, ref_list):
+    print("\n" + "="*65)
+    print("  TASK 5 — Classifier-Free Guidance")
+    print("="*65)
+    if not hasattr(model.model, 'encode_source'):
+        print("  SKIP: not D3PMCrossAttention.")
+        return
+    from analysis.quality_classifier import (
+        QualityClassifier, collect_quality_data,
+        train_quality_classifier, sweep_guidance_scales)
+    clf_path = os.path.join(OUTPUT_DIR, "task5_quality_classifier.pt")
+    d_model  = cfg['model']['d_model']
+    # Step 1: collect or load training data
+    data_path = os.path.join(OUTPUT_DIR, "task5_quality_data.npz")
+    if os.path.exists(data_path):
+        print("  Loading cached quality data...")
+        data    = np.load(data_path)
+        hidden  = data["hidden"]
+        quality = data["quality"]
+    else:
+        print("  Collecting quality data (this takes a few minutes)...")
+        n       = min(2000, len(src_list))
+        hidden, quality = collect_quality_data(
+            model, src_list[:n], ref_list[:n], tgt_tok,
+            t_capture=0, max_samples=n)
+        np.savez(data_path, hidden=hidden, quality=quality)
+        print(f"  Saved quality data: {data_path}")
+    # Step 2: train or load classifier
+    if os.path.exists(clf_path):
+        print(f"  Loading cached classifier: {clf_path}")
+        clf = QualityClassifier(d_model)
+        clf.load_state_dict(torch.load(clf_path, map_location='cpu'))
+        clf.eval()
+    else:
+        print("  Training quality classifier...")
+        clf = train_quality_classifier(
+            hidden, quality, d_model=d_model,
+            epochs=30, batch_size=64, lr=1e-3,
+            save_path=clf_path)
+        clf.eval()
+    # Step 3: guidance scale sweep
+    print("\n  Guidance scale sweep (λ ∈ {0.0, 0.5, 1.0, 1.5, 2.0, 3.0})...")
+    n_sweep = min(50, len(src_list))
+    results = sweep_guidance_scales(
+        model, clf, src_list[:n_sweep], ref_list[:n_sweep],
+        tgt_tok, scales=[0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
+        n_samples=n_sweep, device=device, output_dir=OUTPUT_DIR)
+    # Find optimal scale
+    best_scale = min(results, key=lambda s: results[s]["mean_cer"])
+    print(f"\n  Optimal guidance scale: λ={best_scale:.1f}  "
+          f"CER={results[best_scale]['mean_cer']:.4f}")
+    report = os.path.join(OUTPUT_DIR, "task5_report.txt")
+    with open(report, "w") as f:
+        f.write("TASK 5 — CLASSIFIER-FREE GUIDANCE\n" + "="*50 + "\n\n")
+        f.write(f"Classifier params: {sum(p.numel() for p in clf.parameters())}\n")
+        f.write(f"Training samples : {len(hidden)}\n\n")
+        f.write("Guidance scale sweep:\n")
+        f.write(f"  {'λ':>6}  {'CER':>8}  {'diversity':>10}\n")
+        f.write("  " + "-"*28 + "\n")
+        for s in sorted(results.keys()):
+            r = results[s]
+            marker = " ← optimal" if s == best_scale else ""
+            f.write(f"  {s:>6.1f}  {r['mean_cer']:>8.4f}  {r['diversity']:>10.3f}{marker}\n")
+    print(f"  Report: {report}")
+# ── Main ──────────────────────────────────────────────────────────────
+def main():
+    global OUTPUT_DIR
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task",
+        choices=["1","2","3","4","5","all"], default="all")
+    parser.add_argument("--input",
+        default="dharmo rakṣati rakṣitaḥ",
+        help="IAST input text for Task 2")
+    parser.add_argument("--phase",
+        choices=["generate_configs", "analyze"], default="analyze",
+        help="Task 4 phase: generate_configs (before training) or analyze (after)")
+    parser.add_argument("--checkpoint", default=None,
+        help="Optional explicit checkpoint path")
+    parser.add_argument("--output_dir", default="analysis/outputs",
+        help="Output directory for reports/figures")
+    args = parser.parse_args()
+    OUTPUT_DIR = args.output_dir
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    cfg = copy.deepcopy(CONFIG)
+    if args.checkpoint:
+        cfg["model_type"] = infer_model_type_from_checkpoint(args.checkpoint)
+        cfg["data"]["include_negative_examples"] = infer_include_negative_from_checkpoint(args.checkpoint)
+        ckpt_name = os.path.basename(os.path.dirname(args.checkpoint))
+        if ckpt_name.startswith("T") and ckpt_name[1:].isdigit():
+            t_val = int(ckpt_name[1:])
+            cfg["model"]["diffusion_steps"] = t_val
+            cfg["inference"]["num_steps"] = t_val
+    requested = cfg["training"]["device"]
+    if requested == "mps" and not torch.backends.mps.is_available():
+        requested = "cpu"
+    elif requested == "cuda" and not torch.cuda.is_available():
+        requested = "cpu"
+    cfg["training"]["device"] = requested
+    device = torch.device(requested)
+    print("Loading model and tokenizers...")
+    model, src_tok, tgt_tok, cfg = load_everything(cfg, device, ckpt_override=args.checkpoint)
+    # Load val data for tasks that need it (Tasks 3, 4, 5)
+    needs_data = args.task in ("3", "4", "5", "all")
+    if needs_data:
+        print("Loading validation data...")
+        src_list, ref_list, inp_list = load_val_data(cfg, src_tok, tgt_tok, n=500)
+    else:
+        src_list, ref_list, inp_list = [], [], []
+    tasks = (["1","2","3","4","5"] if args.task == "all"
+             else [args.task])
+    for task in tasks:
+        if task == "1":
+            run_task1(model, src_tok, device)
+        elif task == "2":
+            run_task2(model, src_tok, tgt_tok, device, args.input)
+        elif task == "3":
+            run_task3(model, src_tok, tgt_tok, device, src_list, ref_list)
+        elif task == "4":
+            run_task4(args.phase, model, src_tok, tgt_tok, device, cfg,
+                      src_list, ref_list)
+        elif task == "5":
+            run_task5(model, src_tok, tgt_tok, device, cfg, src_list, ref_list)
+    print(f"\n{'='*65}")
+    print(f"  All outputs saved to: {OUTPUT_DIR}/")
+    print("="*65)
+if __name__ == "__main__":
+    main()

analysis/run_tasks_except4_all_models.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""
+Run Tasks 1,2,3,5 for every available checkpoint (excluding Task 4).
+Usage:
+  python analysis/run_tasks_except4_all_models.py
+  python analysis/run_tasks_except4_all_models.py --input "dharmo rakṣati rakṣitaḥ"
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]
+DEFAULT_OUT_ROOT = ROOT / "analysis" / "outputs_multi"
+def discover_checkpoints() -> list[Path]:
+    roots = [ROOT / "results", ROOT / "results7", ROOT / "ablation_results"]
+    out: list[Path] = []
+    for base in roots:
+        if not base.exists():
+            continue
+        for ckpt in sorted(base.glob("*/best_model.pt")):
+            out.append(ckpt)
+    return out
+def slug_for_checkpoint(ckpt: Path) -> str:
+    root = ckpt.parent.parent.name
+    exp = ckpt.parent.name
+    return f"{root}__{exp}"
+def run_task(task: str, ckpt: Path, input_text: str, out_dir: Path) -> tuple[int, float]:
+    cmd = [
+        sys.executable,
+        str(ROOT / "analysis" / "run_analysis.py"),
+        "--task", task,
+        "--checkpoint", str(ckpt),
+        "--output_dir", str(out_dir),
+    ]
+    if task == "2":
+        cmd.extend(["--input", input_text])
+    start = datetime.now()
+    env = os.environ.copy()
+    env.setdefault("HF_HOME", "/tmp/hf_home")
+    env.setdefault("HF_DATASETS_CACHE", "/tmp/hf_datasets")
+    env.setdefault("HF_HUB_CACHE", "/tmp/hf_hub")
+    env.setdefault("TRANSFORMERS_CACHE", "/tmp/hf_transformers")
+    os.makedirs(env["HF_HOME"], exist_ok=True)
+    os.makedirs(env["HF_DATASETS_CACHE"], exist_ok=True)
+    os.makedirs(env["HF_HUB_CACHE"], exist_ok=True)
+    os.makedirs(env["TRANSFORMERS_CACHE"], exist_ok=True)
+    proc = subprocess.run(cmd, cwd=str(ROOT), env=env)
+    seconds = (datetime.now() - start).total_seconds()
+    return proc.returncode, seconds
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", default="dharmo rakṣati rakṣitaḥ")
+    parser.add_argument("--out_root", default=str(DEFAULT_OUT_ROOT))
+    args = parser.parse_args()
+    checkpoints = discover_checkpoints()
+    if not checkpoints:
+        raise FileNotFoundError("No checkpoints found under results/results7/ablation_results.")
+    out_root = Path(args.out_root)
+    out_root.mkdir(parents=True, exist_ok=True)
+    tasks = ["1", "2", "3", "5"]
+    summary = {
+        "timestamp": datetime.now().isoformat(timespec="seconds"),
+        "tasks": tasks,
+        "checkpoints": [],
+    }
+    for ckpt in checkpoints:
+        slug = slug_for_checkpoint(ckpt)
+        model_out = out_root / slug
+        model_out.mkdir(parents=True, exist_ok=True)
+        print(f"\n=== Checkpoint: {ckpt} ===")
+        model_item = {
+            "checkpoint": str(ckpt),
+            "output_dir": str(model_out),
+            "tasks": [],
+        }
+        for task in tasks:
+            task_out = model_out / f"task{task}"
+            task_out.mkdir(parents=True, exist_ok=True)
+            print(f"-> Running task {task} ...")
+            code, sec = run_task(task, ckpt, args.input, task_out)
+            item = {
+                "task": task,
+                "exit_code": code,
+                "seconds": round(sec, 2),
+                "output_dir": str(task_out),
+            }
+            model_item["tasks"].append(item)
+            status = "OK" if code == 0 else "FAILED"
+            print(f"   {status} ({sec:.1f}s)")
+        summary["checkpoints"].append(model_item)
+    summary_path = out_root / "summary.json"
+    with summary_path.open("w", encoding="utf-8") as f:
+        json.dump(summary, f, ensure_ascii=False, indent=2)
+    print(f"\nSaved summary: {summary_path}")
+if __name__ == "__main__":
+    main()

analysis/semantic_drift.py ADDED Viewed

	@@ -0,0 +1,569 @@

+# """
+# analysis/semantic_drift.py
+# ===========================
+# Task 2: Semantic drift metric — how much does the intermediate generation
+# diverge from the final output as we walk through diffusion steps T → 0?
+#
+# Metric: CER between x0_estimate at each step vs the final x0 at t=0.
+#
+# A well-trained model should show:
+#   - High drift at t=T-1 (near-random initial estimate)
+#   - Rapid decrease in drift around t=T//2 (model finds the right structure)
+#   - Near-zero drift at t=10 (output is stable, only fine corrections remain)
+#
+# If drift stays high until t=5 then suddenly collapses → model is doing all
+# its work in the last few steps → consider reducing T.
+#
+# Also measures:
+#   - Token stability: fraction of positions that don't change between steps
+#   - Lock-in time: first step where each position "commits" to its final token
+#
+# No retraining required. Uses generate_cached() with intermediate snapshots.
+# """
+#
+# import torch
+# import torch.nn.functional as F
+# import numpy as np
+# from typing import List, Dict, Optional, Tuple
+#
+#
+# def compute_cer_between(pred: str, ref: str) -> float:
+#     """CER between two strings."""
+#     if not ref:
+#         return 1.0 if pred else 0.0
+#
+#     def edit_distance(s1, s2):
+#         m, n = len(s1), len(s2)
+#         dp = list(range(n + 1))
+#         for i in range(1, m + 1):
+#             prev, dp[0] = dp[0], i
+#             for j in range(1, n + 1):
+#                 temp = dp[j]
+#                 dp[j] = prev if s1[i-1] == s2[j-1] else 1 + min(prev, dp[j], dp[j-1])
+#                 prev = temp
+#         return dp[n]
+#
+#     return edit_distance(pred, ref) / len(ref)
+#
+#
+# @torch.no_grad()
+# def capture_intermediate_outputs(
+#     model,
+#     src:          torch.Tensor,
+#     tgt_tokenizer,
+#     capture_every: int = 5,
+#     temperature:   float = 0.8,
+#     top_k:         int   = 40,
+# ) -> Tuple[Dict[int, str], str]:
+#     """
+#     Run generation while recording the decoded x0_estimate at every
+#     `capture_every` diffusion steps.
+#
+#     Args:
+#         model         : SanskritModel (D3PMCrossAttention)
+#         src           : [1, src_len] IAST token ids (single sample)
+#         tgt_tokenizer : SanskritTargetTokenizer for decoding intermediate outputs
+#         capture_every : record every N steps
+#         temperature   : sampling temperature
+#         top_k         : top-k filter
+#
+#     Returns:
+#         step_outputs : dict mapping t_val → decoded Devanagari string at that step
+#         final_output : decoded string at t=0 (final result)
+#     """
+#     if src.dim() == 1:
+#         src = src.unsqueeze(0)
+#
+#     inner  = model.model
+#     T      = inner.scheduler.num_timesteps
+#     device = src.device
+#
+#     # Encode source once (KV cache)
+#     memory, src_pad_mask = inner.encode_source(src)
+#
+#     B       = src.shape[0]
+#     tgt_len = inner.max_seq_len
+#     mask_id = inner.mask_token_id
+#
+#     x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device)
+#     hint   = None
+#
+#     step_outputs: Dict[int, str] = {}
+#     inner.eval()
+#
+#     for t_val in range(T - 1, -1, -1):
+#         t       = torch.full((B,), t_val, dtype=torch.long, device=device)
+#         is_last = (t_val == 0)
+#
+#         logits, _ = inner.forward_cached(
+#             memory, src_pad_mask, x0_est, t,
+#             x0_hint=hint, inference_mode=True,
+#         )
+#
+#         logits = logits / max(temperature, 1e-8)
+#         if top_k > 0:
+#             V = logits.shape[-1]
+#             if top_k < V:
+#                 topk_vals, _ = torch.topk(logits, top_k, dim=-1)
+#                 threshold    = topk_vals[..., -1].unsqueeze(-1)
+#                 logits       = logits.masked_fill(logits < threshold, float('-inf'))
+#
+#         probs  = F.softmax(logits, dim=-1)
+#         x0_est = torch.argmax(probs, dim=-1) if is_last else _sample(probs)
+#         hint   = x0_est
+#
+#         # Capture at this step
+#         if (T - 1 - t_val) % capture_every == 0 or is_last:
+#             ids  = [x for x in x0_est[0].tolist() if x > 4]
+#             text = tgt_tokenizer.decode(ids).strip()
+#             step_outputs[t_val] = text
+#
+#     final_output = step_outputs.get(0, "")
+#     return step_outputs, final_output
+#
+#
+# def _sample(probs):
+#     B, L, V = probs.shape
+#     flat    = probs.view(B * L, V).clamp(min=1e-9)
+#     flat    = flat / flat.sum(dim=-1, keepdim=True)
+#     return torch.multinomial(flat, 1).squeeze(-1).view(B, L)
+#
+#
+# def compute_drift(
+#     step_outputs:  Dict[int, str],
+#     final_output:  str,
+# ) -> Dict[str, object]:
+#     """
+#     Compute drift metrics comparing each intermediate output to the final.
+#
+#     Returns dict with:
+#       t_vals      : list of captured timesteps (T-1 → 0)
+#       cer_to_final: CER between each step's output and the final output
+#                     0.0 = identical to final, 1.0 = completely different
+#       lock_in_t   : first t_val where CER drops and stays below 0.1
+#                     (step at which output "commits" to final form)
+#     """
+#     t_vals       = sorted(step_outputs.keys(), reverse=True)   # T-1 → 0
+#     cer_to_final = []
+#
+#     for t_val in t_vals:
+#         cer = compute_cer_between(step_outputs[t_val], final_output)
+#         cer_to_final.append(cer)
+#
+#     # Find lock-in: first step where CER stays below threshold for rest of run
+#     threshold = 0.1
+#     lock_in_t = 0   # default: never locked in early
+#     for i, (t_val, cer) in enumerate(zip(t_vals, cer_to_final)):
+#         if all(c <= threshold for c in cer_to_final[i:]):
+#             lock_in_t = t_val
+#             break
+#
+#     return {
+#         "t_vals":       t_vals,
+#         "cer_to_final": cer_to_final,
+#         "lock_in_t":    lock_in_t,
+#         "final_output": final_output,
+#     }
+#
+#
+# def compute_token_stability(
+#     step_outputs:  Dict[int, str],
+#     final_output:  str,
+#     tgt_tokenizer,
+# ) -> Dict[str, object]:
+#     """
+#     Token-level stability: for each position, at which diffusion step
+#     does it first match its final token and stay matched?
+#
+#     Returns:
+#       position_lock_times: list of t_val at which each position locks in
+#       mean_lock_t        : average lock-in timestep across positions
+#     """
+#     T      = max(step_outputs.keys())
+#     t_vals = sorted(step_outputs.keys(), reverse=True)   # T-1 → 0
+#
+#     # Encode all intermediate outputs and the final
+#     def encode(text):
+#         return tgt_tokenizer.encode(text)
+#
+#     final_ids = encode(final_output)
+#     L         = len(final_ids)
+#
+#     # Build matrix: [n_steps, L]
+#     step_ids = []
+#     for t_val in t_vals:
+#         step_ids.append(encode(step_outputs.get(t_val, "")))
+#
+#     # Pad all to same length
+#     max_len = max(len(s) for s in step_ids)
+#     step_ids = [s + [1] * (max_len - len(s)) for s in step_ids]   # 1=PAD
+#     final_ids_padded = final_ids + [1] * (max_len - len(final_ids))
+#
+#     step_arr  = np.array(step_ids)                # [n_steps, L]
+#     final_arr = np.array(final_ids_padded)         # [L]
+#
+#     # For each position: find first step index where it matches final
+#     # and stays matched for all subsequent steps
+#     position_lock_steps = []
+#     for pos in range(min(L, max_len)):
+#         col = step_arr[:, pos]   # [n_steps]
+#         fin = final_arr[pos]
+#         locked_at = len(t_vals) - 1   # default: never locks early
+#         for i in range(len(t_vals)):
+#             if all(col[i:] == fin):
+#                 locked_at = i
+#                 break
+#         position_lock_steps.append(t_vals[locked_at] if locked_at < len(t_vals) else 0)
+#
+#     return {
+#         "position_lock_times": position_lock_steps,
+#         "mean_lock_t":         float(np.mean(position_lock_steps)),
+#         "std_lock_t":          float(np.std(position_lock_steps)),
+#     }
+#
+#
+# def plot_drift_curve(
+#     drift_result: Dict,
+#     src_text:     str = "",
+#     save_path:    Optional[str] = None,
+# ):
+#     """
+#     Plot CER-to-final vs diffusion step.
+#     Shows where the model "commits" to the final output.
+#     """
+#     try:
+#         import matplotlib.pyplot as plt
+#     except ImportError:
+#         print("pip install matplotlib.")
+#         return
+#
+#     t_vals  = drift_result["t_vals"]
+#     cers    = drift_result["cer_to_final"]
+#     lock_t  = drift_result["lock_in_t"]
+#
+#     fig, ax = plt.subplots(figsize=(12, 4))
+#     ax.plot(range(len(t_vals)), cers, linewidth=1.8, color='coral', label='CER to final')
+#     ax.fill_between(range(len(t_vals)), cers, alpha=0.15, color='coral')
+#
+#     # Mark lock-in point
+#     if lock_t in t_vals:
+#         lock_idx = t_vals.index(lock_t)
+#         ax.axvline(lock_idx, color='steelblue', linestyle='--', linewidth=1.2,
+#                    label=f"Lock-in at t={lock_t}")
+#
+#     ax.axhline(0.1, color='gray', linestyle=':', linewidth=1, alpha=0.7)
+#
+#     n = len(t_vals)
+#     tick_positions = list(range(0, n, max(1, n // 10)))
+#     ax.set_xticks(tick_positions)
+#     ax.set_xticklabels([str(t_vals[i]) for i in tick_positions], fontsize=8)
+#     ax.set_xlabel("Diffusion step t  (T-1 → 0)", fontsize=11)
+#     ax.set_ylabel("CER vs final output", fontsize=11)
+#     ax.set_ylim(0, 1.05)
+#     ax.set_xlim(0, n - 1)
+#     ax.legend(fontsize=10)
+#
+#     title = f"Semantic drift"
+#     if src_text:
+#         title += f"  |  src: {src_text[:50]}"
+#     ax.set_title(title, fontsize=11)
+#     plt.tight_layout()
+#
+#     if save_path:
+#         import os
+#         os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+#         plt.savefig(save_path, dpi=150, bbox_inches='tight')
+#         print(f"Saved: {save_path}")
+#     else:
+#         plt.show()
+#     plt.close()
+# ============================================================
+# TASK 2: Source–Paraphrase Semantic Alignment Trajectory
+# ============================================================
+import torch
+import torch.nn.functional as F
+import numpy as np
+import matplotlib.pyplot as plt
+from typing import Dict, List, Tuple
+from collections import defaultdict
+# Optional (install if needed)
+# pip install bert-score scikit-learn
+from bert_score import score as bertscore
+from sklearn.feature_extraction.text import TfidfVectorizer
+# ============================================================
+# ------------------ ATTENTION HOOK --------------------------
+# ============================================================
+def register_attention_hooks(model):
+    """
+    Registers forward hooks to capture cross-attention weights
+    from each decoder block.
+    Assumes each block has attribute `.cross_attn.attn_weights`
+    """
+    inner = model.model
+    attention_maps = []
+    def hook_fn(module, input, output):
+        if hasattr(module, "attn_weights"):
+            attention_maps.append(module.attn_weights.detach().cpu())
+    hooks = []
+    for block in inner.decoder_blocks:
+        if hasattr(block, "cross_attn"):
+            h = block.cross_attn.register_forward_hook(hook_fn)
+            hooks.append(h)
+    return hooks, attention_maps
+# ============================================================
+# ------------------ CAPTURE TRAJECTORY ----------------------
+# ============================================================
+@torch.no_grad()
+def capture_alignment_trajectory(
+    model,
+    src_tensor: torch.Tensor,
+    src_text: str,
+    tgt_tokenizer,
+    steps_to_capture: List[int] = None,
+):
+    """
+    Capture:
+      - intermediate outputs
+      - cross-attention maps
+      - BERTScore vs source
+    Returns:
+      dict with outputs, attention, drift
+    """
+    inner = model.model
+    device = src_tensor.device
+    T = inner.scheduler.num_timesteps
+    if steps_to_capture is None:
+        steps_to_capture = list(range(T - 1, -1, -5)) + [0]
+    # Register hooks
+    hooks, attn_storage = register_attention_hooks(model)
+    memory, src_pad_mask = inner.encode_source(src_tensor)
+    B = src_tensor.shape[0]
+    tgt_len = inner.max_seq_len
+    mask_id = inner.mask_token_id
+    x0_est = torch.full((B, tgt_len), mask_id, device=device)
+    hint = None
+    outputs = {}
+    attention_per_step = {}
+    for t_val in range(T - 1, -1, -1):
+        t = torch.full((B,), t_val, device=device)
+        logits, _ = inner.forward_cached(
+            memory, src_pad_mask, x0_est, t,
+            x0_hint=hint, inference_mode=True
+        )
+        probs = F.softmax(logits, dim=-1)
+        x0_est = torch.argmax(probs, dim=-1)
+        hint = x0_est
+        if t_val in steps_to_capture:
+            ids = [x for x in x0_est[0].tolist() if x > 4]
+            text = tgt_tokenizer.decode(ids)
+            outputs[t_val] = text
+            # Collect attention maps (last layer only for simplicity)
+            if len(attn_storage) > 0:
+                attention_per_step[t_val] = attn_storage[-1].numpy()
+    # Remove hooks
+    for h in hooks:
+        h.remove()
+    # Compute BERTScore trajectory
+    bert_scores = compute_bert_alignment(src_text, outputs)
+    return {
+        "outputs": outputs,
+        "attention": attention_per_step,
+        "bert_scores": bert_scores,
+    }
+# ============================================================
+# ------------------ BERTScore -------------------------------
+# ============================================================
+def compute_bert_alignment(src_text: str, outputs: Dict[int, str]):
+    """
+    Compute BERTScore between source and each intermediate output
+    """
+    scores = {}
+    for t, text in outputs.items():
+        P, R, F1 = bertscore([text], [src_text], lang="hi", verbose=False)
+        scores[t] = float(F1.mean())
+    return scores
+# ============================================================
+# ------------------ SEMANTIC DRIFT --------------------------
+# ============================================================
+def compute_semantic_drift(bert_scores: Dict[int, float]):
+    """
+    Drift = drop from best alignment
+    """
+    max_score = max(bert_scores.values())
+    drift = {t: max_score - s for t, s in bert_scores.items()}
+    return drift
+# ============================================================
+# ------------------ ATTENTION STABILITY ---------------------
+# ============================================================
+def compute_attention_stability(attention_maps: Dict[int, np.ndarray]):
+    """
+    Measures if tokens attend consistently across steps.
+    """
+    steps = sorted(attention_maps.keys(), reverse=True)
+    stability_scores = []
+    for i in range(len(steps) - 1):
+        A = attention_maps[steps[i]]
+        B = attention_maps[steps[i+1]]
+        diff = np.abs(A - B).mean()
+        stability_scores.append(diff)
+    return np.mean(stability_scores)
+# ============================================================
+# ------------------ TF-IDF vs STABILITY ---------------------
+# ============================================================
+def compute_tfidf_attention_correlation(
+    src_texts: List[str],
+    attention_maps_list: List[Dict[int, np.ndarray]]
+):
+    """
+    Correlate TF-IDF importance with attention stability
+    """
+    vectorizer = TfidfVectorizer()
+    tfidf = vectorizer.fit_transform(src_texts).toarray()
+    word_importance = tfidf.mean(axis=0)
+    stability = []
+    for attn_maps in attention_maps_list:
+        stability.append(compute_attention_stability(attn_maps))
+    corr = np.corrcoef(word_importance[:len(stability)], stability)[0, 1]
+    return corr
+# ============================================================
+# ------------------ HEATMAP VISUALIZATION -------------------
+# ============================================================
+def plot_attention_heatmap(attn: np.ndarray, title="Attention"):
+    """
+    Plot cross-attention heatmap
+    attn: [tgt_len, src_len]
+    """
+    plt.figure(figsize=(6,5))
+    plt.imshow(attn, aspect='auto', cmap='viridis')
+    plt.colorbar()
+    plt.title(title)
+    plt.xlabel("Source tokens")
+    plt.ylabel("Target tokens")
+    plt.show()
+def visualize_trajectory(attention_maps: Dict[int, np.ndarray]):
+    """
+    Show attention evolution over time
+    """
+    steps = sorted(attention_maps.keys(), reverse=True)
+    for t in steps[:5]:  # show 5 steps
+        plot_attention_heatmap(attention_maps[t], title=f"Step t={t}")
+# ============================================================
+# ------------------ LOCKED vs FLEXIBLE ----------------------
+# ============================================================
+def analyze_token_behavior(attention_maps: Dict[int, np.ndarray]):
+    """
+    Detect whether tokens are locked or flexible
+    """
+    steps = sorted(attention_maps.keys(), reverse=True)
+    first = attention_maps[steps[0]]
+    last = attention_maps[steps[-1]]
+    diff = np.abs(first - last).mean(axis=1)
+    locked = np.where(diff < 0.05)[0]
+    flexible = np.where(diff >= 0.05)[0]
+    return {
+        "locked_tokens": locked.tolist(),
+        "flexible_tokens": flexible.tolist()
+    }
+# ============================================================
+# ------------------ MASTER FUNCTION -------------------------
+# ============================================================
+def run_task2_analysis(
+    model,
+    src_tensor,
+    src_text,
+    tgt_tokenizer
+):
+    result = capture_alignment_trajectory(
+        model, src_tensor, src_text, tgt_tokenizer
+    )
+    drift = compute_semantic_drift(result["bert_scores"])
+    stability = compute_attention_stability(result["attention"])
+    behavior = analyze_token_behavior(result["attention"])
+    print("\nBERTScore trajectory:")
+    print(result["bert_scores"])
+    print("\nSemantic drift:")
+    print(drift)
+    print(f"\nAttention stability: {stability:.4f}")
+    print("\nToken behavior:")
+    print(behavior)
+    visualize_trajectory(result["attention"])
+    return {
+        "trajectory": result,
+        "drift": drift,
+        "stability": stability,
+        "behavior": behavior
+    }

analysis/step_ablation.py ADDED Viewed

	@@ -0,0 +1,582 @@

+# """
+# analysis/step_ablation.py
+# ==========================
+# Task 4: Semantic Robustness — Ablation of Diffusion Steps vs Meaning Preservation
+#
+# Two-phase workflow (retraining IS required for different T values):
+#
+#   PHASE 1 — Generate configs + train (run once per T value):
+#     python analysis/step_ablation.py --phase generate_configs
+#     # Creates configs: ablation_configs/T4.py, T8.py, T16.py, T32.py, T64.py
+#     # Then train each: MODEL_TYPE=d3pm_cross_attention python train.py  (for each config)
+#
+#   PHASE 2 — Analyze trained models (no retraining needed):
+#     python analysis/step_ablation.py --phase analyze
+#     # Loads each trained model, generates 200 paraphrases, computes CER
+#     # Produces 3D plot: X=steps, Y=generation_speed, Z=CER
+#
+# Why retraining is needed:
+#   A model trained with T=128 learns to denoise from x_t~Uniform[0,128].
+#   Running it with T=4 means the model only sees t∈{0,1,2,3} — which it
+#   was never trained on at those scales. Outputs are meaningless.
+#   You must train a separate model for each T value.
+#
+# Also implements adversarial robustness test (no retraining):
+#   Takes your existing T=128 model and tests whether corrupted IAST
+#   inputs (typos, character swaps) cause proportional output degradation.
+# """
+#
+# import torch
+# import torch.nn.functional as F
+# import numpy as np
+# import os
+# import sys
+# import time
+# import json
+# import copy
+# from typing import List, Dict, Optional
+#
+# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+#
+#
+# # ── Phase 1: Config generation ────────────────────────────────────────
+#
+# T_VALUES = [4, 8, 16, 32, 64]
+#
+# def generate_ablation_configs(base_config_path: str = "config.py",
+#                                output_dir: str = "ablation_configs"):
+#     """
+#     Generate one config file per T value.
+#     Each config is a copy of the base config with diffusion_steps changed.
+#
+#     After running this, train each model:
+#         for T in 4 8 16 32 64; do
+#             cp ablation_configs/config_T${T}.py config.py
+#             python train.py
+#             mv results7/d3pm_cross_attention_neg_False \
+#                ablation_results/T${T}
+#         done
+#     """
+#     os.makedirs(output_dir, exist_ok=True)
+#
+#     # Read base config
+#     with open(base_config_path, "r") as f:
+#         base_src = f.read()
+#
+#     for T in T_VALUES:
+#         # Replace diffusion_steps and num_steps
+#         cfg_src = base_src
+#         cfg_src = cfg_src.replace(
+#             '"diffusion_steps": 128',
+#             f'"diffusion_steps": {T}'
+#         )
+#         cfg_src = cfg_src.replace(
+#             "'diffusion_steps': 128",
+#             f"'diffusion_steps': {T}"
+#         )
+#         cfg_src = cfg_src.replace(
+#             '"num_steps": 128',
+#             f'"num_steps": {T}'
+#         )
+#         cfg_src = cfg_src.replace(
+#             "'num_steps': 128",
+#             f"'num_steps': {T}"
+#         )
+#         out_path = os.path.join(output_dir, f"config_T{T}.py")
+#         with open(out_path, "w") as f:
+#             f.write(f"# Ablation config: T={T} diffusion steps\n")
+#             f.write(cfg_src)
+#         print(f"  Wrote: {out_path}")
+#
+#     # Write a shell script to train all
+#     shell_script = os.path.join(output_dir, "train_all.sh")
+#     with open(shell_script, "w") as f:
+#         f.write("#!/bin/bash\n")
+#         f.write("# Run this script to train all ablation models\n\n")
+#         for T in T_VALUES:
+#             f.write(f"echo '=== Training T={T} ==='\n")
+#             f.write(f"cp {output_dir}/config_T{T}.py config.py\n")
+#             f.write(f"python train.py\n")
+#             f.write(f"mkdir -p ablation_results/T{T}\n")
+#             f.write(f"cp -r results7/d3pm_cross_attention_neg_False/best_model.pt "
+#                     f"ablation_results/T{T}/best_model.pt\n")
+#             f.write(f"cp -r results7/d3pm_cross_attention_neg_False/train.log "
+#                     f"ablation_results/T{T}/train.log\n\n")
+#     os.chmod(shell_script, 0o755)
+#     print(f"\nTraining script: {shell_script}")
+#     print(f"Run: bash {shell_script}")
+#
+#
+# # ── Phase 2: Analysis (after models are trained) ──────────────────────
+#
+# def compute_cer(pred: str, ref: str) -> float:
+#     if not ref:
+#         return 1.0
+#
+#     def edit_distance(s1, s2):
+#         m, n = len(s1), len(s2)
+#         dp = list(range(n + 1))
+#         for i in range(1, m + 1):
+#             prev, dp[0] = dp[0], i
+#             for j in range(1, n + 1):
+#                 temp = dp[j]
+#                 dp[j] = prev if s1[i-1] == s2[j-1] else 1 + min(prev, dp[j], dp[j-1])
+#                 prev = temp
+#         return dp[n]
+#
+#     return edit_distance(pred, ref) / max(len(ref), 1)
+#
+#
+# def evaluate_model(
+#     model,
+#     src_list:      List[torch.Tensor],
+#     ref_list:      List[str],
+#     tgt_tokenizer,
+#     n_samples:     int   = 200,
+#     temperature:   float = 0.8,
+#     top_k:         int   = 40,
+# ) -> Dict:
+#     """
+#     Generate n_samples outputs and compute CER + generation speed.
+#
+#     Returns dict with:
+#         mean_cer      : average CER over samples
+#         generation_s  : total wall-clock seconds for all generations
+#         speed_per_sample: seconds per sample
+#         cer_list      : per-sample CER values
+#     """
+#     device   = next(model.parameters()).device
+#     n        = min(n_samples, len(src_list))
+#     cer_list = []
+#
+#     start = time.perf_counter()
+#     for i, (src, ref) in enumerate(zip(src_list[:n], ref_list[:n])):
+#         if src.dim() == 1:
+#             src = src.unsqueeze(0)
+#
+#         with torch.no_grad():
+#             if hasattr(model.model, 'generate_cached'):
+#                 out = model.model.generate_cached(
+#                     src.to(device), temperature=temperature, top_k=top_k
+#                 )
+#             else:
+#                 out = model.generate(
+#                     src.to(device), temperature=temperature, top_k=top_k
+#                 )
+#
+#         ids  = [x for x in out[0].tolist() if x > 4]
+#         pred = tgt_tokenizer.decode(ids).strip()
+#         cer  = compute_cer(pred, ref)
+#         cer_list.append(cer)
+#
+#     elapsed = time.perf_counter() - start
+#
+#     return {
+#         "mean_cer":          float(np.mean(cer_list)),
+#         "std_cer":           float(np.std(cer_list)),
+#         "generation_s":      elapsed,
+#         "speed_per_sample":  elapsed / max(n, 1),
+#         "cer_list":          cer_list,
+#         "n_samples":         n,
+#     }
+#
+#
+# def run_ablation_analysis(
+#     ablation_dir:  str = "ablation_results",
+#     base_cfg:      dict = None,
+#     src_list:      List[torch.Tensor] = None,
+#     ref_list:      List[str] = None,
+#     tgt_tokenizer  = None,
+#     device:        torch.device = None,
+#     output_dir:    str = "analysis/outputs",
+# ) -> Dict:
+#     """
+#     Load each trained model and evaluate.
+#     Produces results dict and 3D plot.
+#
+#     Expects ablation_results/T{N}/best_model.pt for each T in T_VALUES.
+#     """
+#     from inference import load_model
+#
+#     results = {}
+#     for T in T_VALUES:
+#         ckpt = os.path.join(ablation_dir, f"T{T}", "best_model.pt")
+#         if not os.path.exists(ckpt):
+#             print(f"  SKIP T={T}: no checkpoint at {ckpt}")
+#             continue
+#
+#         print(f"\nEvaluating T={T}...")
+#         cfg_T = copy.deepcopy(base_cfg)
+#         cfg_T['model']['diffusion_steps'] = T
+#         cfg_T['inference']['num_steps']   = T
+#
+#         model, cfg_T = load_model(ckpt, cfg_T, device)
+#         model.eval()
+#
+#         metrics = evaluate_model(
+#             model, src_list, ref_list, tgt_tokenizer, n_samples=200
+#         )
+#         results[T] = metrics
+#         print(f"  T={T}  CER={metrics['mean_cer']:.4f}  "
+#               f"speed={metrics['speed_per_sample']:.3f}s/sample")
+#
+#         del model
+#
+#     # Save results
+#     os.makedirs(output_dir, exist_ok=True)
+#     results_path = os.path.join(output_dir, "ablation_results.json")
+#     with open(results_path, "w") as f:
+#         json.dump({str(k): {kk: vv for kk, vv in v.items() if kk != 'cer_list'}
+#                    for k, v in results.items()}, f, indent=2)
+#     print(f"\nResults saved: {results_path}")
+#
+#     return results
+#
+#
+# def plot_ablation_3d(
+#     results:   Dict,
+#     save_path: Optional[str] = None,
+# ):
+#     """
+#     3D plot: X=diffusion_steps, Y=generation_speed(s/sample), Z=CER.
+#     Also produces a 2D summary plot.
+#     """
+#     try:
+#         import matplotlib.pyplot as plt
+#         from mpl_toolkits.mplot3d import Axes3D
+#     except ImportError:
+#         print("pip install matplotlib.")
+#         return
+#
+#     T_list    = sorted(results.keys())
+#     cers      = [results[T]["mean_cer"] for T in T_list]
+#     speeds    = [results[T]["speed_per_sample"] for T in T_list]
+#
+#     # ── 3D plot ───────────────────────────────────────────────────────
+#     fig = plt.figure(figsize=(14, 5))
+#
+#     ax3d = fig.add_subplot(121, projection='3d')
+#     ax3d.scatter(T_list, speeds, cers, c=cers, cmap='RdYlGn_r', s=80)
+#     for T, s, c in zip(T_list, speeds, cers):
+#         ax3d.text(T, s, c, f"T={T}", fontsize=8)
+#     ax3d.set_xlabel("Diffusion steps T", fontsize=9)
+#     ax3d.set_ylabel("Speed (s/sample)", fontsize=9)
+#     ax3d.set_zlabel("CER (↓ better)", fontsize=9)
+#     ax3d.set_title("T vs speed vs CER", fontsize=10)
+#
+#     # ── 2D CER vs T (find the knee) ──────────────────────────────────
+#     ax2d = fig.add_subplot(122)
+#     ax2d.plot(T_list, cers, 'o-', linewidth=1.8, color='coral', markersize=7)
+#     for T, c in zip(T_list, cers):
+#         ax2d.annotate(f"{c:.3f}", (T, c), textcoords="offset points",
+#                       xytext=(0, 8), fontsize=8, ha='center')
+#
+#     # Find knee: largest CER drop per unit T (elbow method)
+#     if len(T_list) >= 3:
+#         drops  = [cers[i] - cers[i+1] for i in range(len(cers)-1)]
+#         knee_i = int(np.argmax(drops))
+#         knee_T = T_list[knee_i + 1]
+#         ax2d.axvline(knee_T, color='steelblue', linestyle='--', linewidth=1.2,
+#                      label=f"Knee at T={knee_T}")
+#         ax2d.legend(fontsize=9)
+#
+#     ax2d.set_xlabel("Diffusion steps T", fontsize=10)
+#     ax2d.set_ylabel("CER (lower = better)", fontsize=10)
+#     ax2d.set_title("CER vs diffusion steps", fontsize=10)
+#     ax2d.set_ylim(0, max(cers) * 1.1)
+#
+#     plt.tight_layout()
+#     if save_path:
+#         os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+#         plt.savefig(save_path, dpi=150, bbox_inches='tight')
+#         print(f"Saved: {save_path}")
+#     else:
+#         plt.show()
+#     plt.close()
+#
+#
+# # ── Adversarial robustness test (no retraining needed) ───────────────
+#
+# def corrupt_iast(text: str, corruption_rate: float = 0.05) -> str:
+#     """
+#     Introduce random corruption into IAST text:
+#       - Character swap (adjacent chars swapped)
+#       - Character deletion
+#       - Random character insertion
+#
+#     Models rate as 5% to 20% corruption to test robustness.
+#     """
+#     import random
+#     chars = list(text)
+#     n_corrupt = max(1, int(len(chars) * corruption_rate))
+#
+#     for _ in range(n_corrupt):
+#         op  = random.choice(['swap', 'delete', 'insert'])
+#         pos = random.randint(0, len(chars) - 1)
+#
+#         if op == 'swap' and pos < len(chars) - 1:
+#             chars[pos], chars[pos+1] = chars[pos+1], chars[pos]
+#         elif op == 'delete' and len(chars) > 1:
+#             chars.pop(pos)
+#         elif op == 'insert':
+#             chars.insert(pos, random.choice('abcdeimnostu'))
+#
+#     return "".join(chars)
+#
+#
+# @torch.no_grad()
+# def run_adversarial_test(
+#     model,
+#     src_tokenizer,
+#     tgt_tokenizer,
+#     test_inputs:    List[str],
+#     test_refs:      List[str],
+#     corruption_rates: List[float] = [0.0, 0.05, 0.10, 0.15, 0.20],
+#     device:         torch.device  = None,
+#     output_dir:     str           = "analysis/outputs",
+# ) -> Dict:
+#     """
+#     Test if CER degrades proportionally with IAST corruption.
+#     Uses existing trained model — no retraining.
+#     """
+#     device = device or next(model.parameters()).device
+#     results = {}
+#
+#     print("\nAdversarial robustness test...")
+#     for rate in corruption_rates:
+#         cer_list = []
+#         for text, ref in zip(test_inputs, test_refs):
+#             corrupted = corrupt_iast(text, rate)
+#             ids       = src_tokenizer.encode(corrupted)
+#             src       = torch.tensor([ids], dtype=torch.long, device=device)
+#
+#             if hasattr(model.model, 'generate_cached'):
+#                 out = model.model.generate_cached(src)
+#             else:
+#                 out = model.generate(src)
+#
+#             pred_ids = [x for x in out[0].tolist() if x > 4]
+#             pred     = tgt_tokenizer.decode(pred_ids).strip()
+#             cer_list.append(compute_cer(pred, ref))
+#
+#         mean_cer = float(np.mean(cer_list))
+#         results[rate] = mean_cer
+#         print(f"  corruption={rate*100:.0f}%  →  CER={mean_cer:.4f}")
+#
+#     # Save + plot
+#     os.makedirs(output_dir, exist_ok=True)
+#     try:
+#         import matplotlib.pyplot as plt
+#         fig, ax = plt.subplots(figsize=(8, 4))
+#         rates   = [r * 100 for r in corruption_rates]
+#         cers    = [results[r] for r in corruption_rates]
+#         ax.plot(rates, cers, 'o-', linewidth=1.8, color='steelblue', markersize=7)
+#         ax.set_xlabel("IAST corruption rate (%)", fontsize=11)
+#         ax.set_ylabel("CER", fontsize=11)
+#         ax.set_title("Model robustness to IAST input corruption", fontsize=11)
+#         ax.set_ylim(0, max(cers) * 1.2)
+#         plt.tight_layout()
+#         plt.savefig(os.path.join(output_dir, "adversarial_robustness.png"),
+#                     dpi=150, bbox_inches='tight')
+#         plt.close()
+#         print(f"  Saved: {output_dir}/adversarial_robustness.png")
+#     except ImportError:
+#         pass
+#
+#     with open(os.path.join(output_dir, "adversarial_results.json"), "w") as f:
+#         json.dump({str(k): v for k, v in results.items()}, f, indent=2)
+#
+#     return results
+"""
+analysis/task4_pipeline.py
+================================
+Correct Task 4 Pipeline:
+PHASE 1 → Evaluate all models
+PHASE 2 → Analyze + detect optimal T
+NO early decision making.
+"""
+import torch
+import numpy as np
+import time
+import os
+import json
+from typing import Dict, List
+# ───────────────────────��─────────────────────
+# Load Metrics
+# ─────────────────────────────────────────────
+def load_metrics():
+    from bert_score import score as bert_score
+    from sentence_transformers import SentenceTransformer, util
+    from nltk.translate.bleu_score import sentence_bleu
+    st_model = SentenceTransformer('all-MiniLM-L6-v2')
+    return bert_score, st_model, util, sentence_bleu
+# ─────────────────────────────────────────────
+# PHASE 1 — Evaluate ALL models
+# ─────────────────────────────────────────────
+def evaluate_all_models(models: Dict[int, object],
+                        src_list,
+                        ref_list,
+                        tgt_tokenizer,
+                        n_samples=200):
+    bert_score_fn, st_model, util, bleu_fn = load_metrics()
+    results = {}
+    print("\n=== PHASE 1: Evaluating ALL models ===")
+    for T, model in sorted(models.items()):
+        print(f"\nEvaluating T={T}...")
+        device = next(model.parameters()).device
+        preds, refs = [], []
+        start = time.perf_counter()
+        for src, ref in zip(src_list[:n_samples], ref_list[:n_samples]):
+            if src.dim() == 1:
+                src = src.unsqueeze(0)
+            with torch.no_grad():
+                out = model.model.generate_cached(src.to(device))
+            ids = [x for x in out[0].tolist() if x > 4]
+            pred = tgt_tokenizer.decode(ids).strip()
+            preds.append(pred)
+            refs.append(ref)
+        elapsed = time.perf_counter() - start
+        # BERTScore
+        P, R, F1 = bert_score_fn(preds, refs, lang="hi", verbose=False)
+        bert_f1 = float(F1.mean())
+        # Sentence similarity
+        emb_p = st_model.encode(preds, convert_to_tensor=True)
+        emb_r = st_model.encode(refs, convert_to_tensor=True)
+        sim = util.cos_sim(emb_p, emb_r).diagonal().mean().item()
+        # BLEU
+        bleu_scores = [
+            bleu_fn([r.split()], p.split())
+            for p, r in zip(preds, refs)
+        ]
+        results[T] = {
+            "bertscore_f1": bert_f1,
+            "semantic_sim": sim,
+            "bleu": float(np.mean(bleu_scores)),
+            "speed_per_sample": elapsed / n_samples
+        }
+        print(f"  BERTScore: {bert_f1:.4f}")
+        print(f"  Sim: {sim:.4f}")
+        print(f"  BLEU: {results[T]['bleu']:.4f}")
+        print(f"  Speed: {results[T]['speed_per_sample']:.4f}s")
+    # Save raw results
+    os.makedirs("analysis/outputs", exist_ok=True)
+    with open("analysis/outputs/task4_raw_results.json", "w") as f:
+        json.dump(results, f, indent=2)
+    return results
+# ─────────────────────────────────────────────
+# PHASE 2 — Analyze results (Knee Detection)
+# ─────────────────────────────────────────────
+def analyze_results(results: Dict):
+    print("\n=== PHASE 2: Analysis ===")
+    T_list = sorted(results.keys())
+    scores = [results[T]["bertscore_f1"] for T in T_list]
+    gains = [scores[i+1] - scores[i] for i in range(len(scores)-1)]
+    print("\nMarginal Gains:")
+    for i, g in enumerate(gains):
+        print(f"  T{T_list[i]} → T{T_list[i+1]}: +{g:.4f}")
+    # Knee detection
+    threshold = 0.02
+    knee_T = T_list[-1]
+    for i, g in enumerate(gains):
+        if g < threshold:
+            knee_T = T_list[i+1]
+            break
+    print(f"\n✅ Optimal T (knee detected): {knee_T}")
+    return knee_T, gains
+# ─────────────────────────────────────────────
+# 3D Plot (BERTScore)
+# ─────────────────────────────────────────────
+def plot_3d(results):
+    import matplotlib.pyplot as plt
+    from mpl_toolkits.mplot3d import Axes3D
+    T_list = sorted(results.keys())
+    X = T_list
+    Y = [results[T]["speed_per_sample"] for T in T_list]
+    Z = [results[T]["bertscore_f1"] for T in T_list]
+    fig = plt.figure(figsize=(10, 6))
+    ax = fig.add_subplot(111, projection='3d')
+    ax.scatter(X, Y, Z)
+    for x, y, z in zip(X, Y, Z):
+        ax.text(x, y, z, f"T={x}", fontsize=8)
+    ax.set_xlabel("Diffusion Steps")
+    ax.set_ylabel("Speed")
+    ax.set_zlabel("BERTScore")
+    plt.title("3D Tradeoff: Steps vs Speed vs Quality")
+    os.makedirs("analysis/outputs", exist_ok=True)
+    plt.savefig("analysis/outputs/task4_3d.png")
+    plt.close()
+    print("Saved 3D plot")
+# ────────────���────────────────────────────────
+# FINAL RUNNER
+# ─────────────────────────────────────────────
+def run_task4(models, src_list, ref_list, tgt_tokenizer):
+    # Phase 1: Evaluate all
+    results = evaluate_all_models(
+        models, src_list, ref_list, tgt_tokenizer
+    )
+    # Phase 2: Analyze
+    knee_T, gains = analyze_results(results)
+    # Plot
+    plot_3d(results)
+    # Save report
+    with open("analysis/outputs/task4_report.txt", "w") as f:
+        f.write(f"Optimal diffusion steps = {knee_T}\n")
+    return knee_T

app.py CHANGED Viewed

@@ -1,235 +1,547 @@
-"""
-Hugging Face Space app for Sanskrit D3PM project.
-Deploy on Spaces with:
-  app_file = app_hf_space.py
-Optional environment variables:
-  HF_CHECKPOINT_REPO   : model repo id (e.g. "username/sanskrit-d3pm")
-  HF_CHECKPOINT_FILE   : checkpoint path in repo (default: "best_model.pt")
-  HF_CHECKPOINT_LABEL  : UI label for remote checkpoint
-"""
-from __future__ import annotations
 import copy
 import os
-from typing import Dict, Tuple
 import gradio as gr
 import torch
 from config import CONFIG
 from inference import _build_tokenizers, _resolve_device, load_model, run_inference
-def _clean_output(text: str, max_repeat: int = 2) -> str:
-    text = " ".join(text.split())
-    if not text:
-        return text
-    toks = text.split()
-    out = []
-    prev = None
-    run = 0
-    for t in toks:
-        if t == prev:
-            run += 1
-        else:
-            prev = t
-            run = 1
-        if run <= max_repeat:
-            out.append(t)
-    s = " ".join(out)
-    s = s.replace(" ।", "।").replace(" ॥", "॥")
-    return " ".join(s.split())
-def _discover_local_checkpoints() -> Dict[str, str]:
-    found = {}
     for root in ("ablation_results", "results7", "results"):
         if not os.path.isdir(root):
             continue
-        for exp in sorted(os.listdir(root)):
-            ckpt = os.path.join(root, exp, "best_model.pt")
-            if os.path.exists(ckpt):
-                found[f"{exp} [{root}]"] = ckpt
     return found
-def _discover_remote_checkpoint() -> Dict[str, str]:
-    repo = os.getenv("HF_CHECKPOINT_REPO", "").strip()
-    if not repo:
-        return {}
-    filename = os.getenv("HF_CHECKPOINT_FILE", "best_model.pt").strip()
-    label = os.getenv("HF_CHECKPOINT_LABEL", f"remote:{repo}")
-    try:
-        from huggingface_hub import hf_hub_download
-        ckpt_path = hf_hub_download(repo_id=repo, filename=filename)
-        return {label: ckpt_path}
-    except Exception as e:
-        print(f"[WARN] remote checkpoint download failed: {e}")
-        return {}
-def _infer_model_type(path: str) -> str:
-    p = path.lower()
-    if "d3pm_encoder_decoder" in p:
         return "d3pm_encoder_decoder"
-    if "baseline_cross_attention" in p:
         return "baseline_cross_attention"
-    if "baseline_encoder_decoder" in p:
         return "baseline_encoder_decoder"
-    return "d3pm_cross_attention"
-def _infer_neg(path: str) -> bool:
-    p = path.lower()
-    if "_neg_true" in p:
         return True
-    if "_neg_false" in p:
         return False
     return CONFIG["data"]["include_negative_examples"]
-class RuntimeStore:
-    def __init__(self):
-        self.loaded: Dict[str, Dict] = {}
-    def get(self, ckpt_label: str, ckpt_path: str) -> Dict:
-        if ckpt_label in self.loaded:
-            return self.loaded[ckpt_label]
-        cfg = copy.deepcopy(CONFIG)
-        cfg["model_type"] = _infer_model_type(ckpt_path)
-        cfg["data"]["include_negative_examples"] = _infer_neg(ckpt_path)
-        device = _resolve_device(cfg)
-        model, cfg = load_model(ckpt_path, cfg, device)
-        src_tok, tgt_tok = _build_tokenizers(cfg)
-        bundle = {
-            "label": ckpt_label,
-            "path": ckpt_path,
-            "cfg": cfg,
-            "device": str(device),
-            "model": model,
-            "src_tok": src_tok,
-            "tgt_tok": tgt_tok,
-        }
-        self.loaded[ckpt_label] = bundle
-        return bundle
-RUNTIME = RuntimeStore()
-CHECKPOINTS = {}
-CHECKPOINTS.update(_discover_local_checkpoints())
-CHECKPOINTS.update(_discover_remote_checkpoint())
-if not CHECKPOINTS:
-    CHECKPOINTS = {"No checkpoint found": ""}
-def load_checkpoint_ui(label: str) -> Tuple[Dict, str]:
-    if label not in CHECKPOINTS or not CHECKPOINTS[label]:
-        raise gr.Error("No valid checkpoint found. Upload/provide best_model.pt first.")
-    bundle = RUNTIME.get(label, CHECKPOINTS[label])
-    info = (
-        f"Loaded `{label}`\n"
-        f"- path: `{bundle['path']}`\n"
-        f"- model_type: `{bundle['cfg']['model_type']}`\n"
-        f"- device: `{bundle['device']}`\n"
-        f"- max_seq_len: `{bundle['cfg']['model']['max_seq_len']}`"
-    )
-    return bundle, info
-def generate_ui(
-    bundle: Dict,
-    text: str,
-    temperature: float,
-    top_k: int,
-    repetition_penalty: float,
-    diversity_penalty: float,
-    num_steps: int,
-    clean_output: bool,
-) -> str:
-    if not bundle:
-        raise gr.Error("Load a checkpoint first.")
-    if not text.strip():
-        raise gr.Error("Enter input text.")
-    cfg = copy.deepcopy(bundle["cfg"])
     cfg["inference"]["temperature"] = float(temperature)
     cfg["inference"]["top_k"] = int(top_k)
     cfg["inference"]["repetition_penalty"] = float(repetition_penalty)
     cfg["inference"]["diversity_penalty"] = float(diversity_penalty)
     cfg["inference"]["num_steps"] = int(num_steps)
-    src_tok = bundle["src_tok"]
-    tgt_tok = bundle["tgt_tok"]
-    device = torch.device(bundle["device"])
-    ids = torch.tensor([src_tok.encode(text.strip())], dtype=torch.long, device=device)
-    out = run_inference(bundle["model"], ids, cfg)
-    token_ids = [x for x in out[0].tolist() if x > 4]
-    pred = tgt_tok.decode(token_ids).strip()
-    if clean_output:
-        pred = _clean_output(pred)
-    return pred if pred else "(empty output)"
-with gr.Blocks(title="Sanskrit D3PM Space") as demo:
     model_state = gr.State(None)
     gr.Markdown(
         """
-## Sanskrit D3PM Paraphrase (IAST → Devanagari)
-Load a trained checkpoint and generate output from Roman/IAST Sanskrit input.
 """
     )
-    checkpoint = gr.Dropdown(
-        choices=list(CHECKPOINTS.keys()),
-        value=list(CHECKPOINTS.keys())[0],
-        label="Checkpoint",
-    )
-    load_btn = gr.Button("Load Model", variant="primary")
-    load_info = gr.Markdown("Select a checkpoint and click **Load Model**.")
-    text_in = gr.Textbox(label="Input (Roman / IAST)", lines=3, value="dharmo rakṣati rakṣitaḥ")
-    text_out = gr.Textbox(label="Output (Devanagari)", lines=6)
     with gr.Row():
-        temperature = gr.Slider(0.4, 1.2, value=0.70, step=0.05, label="Temperature")
-        top_k = gr.Slider(5, 100, value=40, step=1, label="Top-K")
-        repetition_penalty = gr.Slider(1.0, 3.0, value=1.20, step=0.05, label="Repetition Penalty")
-        diversity_penalty = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Diversity Penalty")
-        num_steps = gr.Slider(1, 128, value=64, step=1, label="Inference Steps")
-        clean_output = gr.Checkbox(value=True, label="Clean Output")
-    generate_btn = gr.Button("Generate", variant="primary")
-    load_btn.click(load_checkpoint_ui, inputs=[checkpoint], outputs=[model_state, load_info])
     generate_btn.click(
-        generate_ui,
         inputs=[
-            model_state, text_in, temperature, top_k, repetition_penalty,
-            diversity_penalty, num_steps, clean_output
         ],
-        outputs=[text_out],
     )
-    text_in.submit(
-        generate_ui,
         inputs=[
-            model_state, text_in, temperature, top_k, repetition_penalty,
-            diversity_penalty, num_steps, clean_output
         ],
-        outputs=[text_out],
     )
 if __name__ == "__main__":
-    port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
     demo.launch(server_name="0.0.0.0", server_port=port, share=False)

 import copy
+import json
 import os
+import subprocess
+import sys
+from datetime import datetime
 import gradio as gr
 import torch
+from huggingface_hub import hf_hub_download, list_repo_files
 from config import CONFIG
 from inference import _build_tokenizers, _resolve_device, load_model, run_inference
+RESULTS_DIR = "generated_results"
+DEFAULT_ANALYSIS_OUT = "analysis/outputs"
+os.makedirs(RESULTS_DIR, exist_ok=True)
+def discover_checkpoints():
+    found = []
     for root in ("ablation_results", "results7", "results"):
         if not os.path.isdir(root):
             continue
+        for entry in sorted(os.listdir(root)):
+            ckpt = os.path.join(root, entry, "best_model.pt")
+            if not os.path.exists(ckpt):
+                continue
+            found.append(
+                {
+                    "label": f"{entry}  [{root}]",
+                    "path": ckpt,
+                    "experiment": entry,
+                    "root": root,
+                }
+            )
+    repo = os.getenv("HF_CHECKPOINT_REPO", "").strip()
+    if repo:
+        branch = os.getenv("HF_CHECKPOINT_REVISION", "main").strip() or "main"
+        try:
+            for fname in list_repo_files(repo_id=repo, repo_type="model", revision=branch):
+                if not fname.endswith("/best_model.pt") and fname != "best_model.pt":
+                    continue
+                local_path = hf_hub_download(repo_id=repo, filename=fname, revision=branch, repo_type="model")
+                parent = os.path.basename(os.path.dirname(fname)) if "/" in fname else "remote"
+                root = os.path.dirname(fname).split("/")[0] if "/" in fname else "remote"
+                found.append(
+                    {
+                        "label": f"{parent}  [hf:{repo}]",
+                        "path": local_path,
+                        "experiment": parent,
+                        "root": root,
+                    }
+                )
+        except Exception as e:
+            print(f"[WARN] Could not discover remote checkpoints from {repo}: {e}")
     return found
+def checkpoint_map():
+    return {item["label"]: item for item in discover_checkpoints()}
+def default_checkpoint_label():
+    cps = discover_checkpoints()
+    if not cps:
+        return None
+    for item in cps:
+        if item["path"].endswith("ablation_results/T4/best_model.pt"):
+            return item["label"]
+    return cps[0]["label"]
+def infer_model_type(experiment_name: str, root: str = "") -> str:
+    if root == "ablation_results":
+        return "d3pm_cross_attention"
+    if experiment_name.startswith("d3pm_cross_attention"):
+        return "d3pm_cross_attention"
+    if experiment_name.startswith("d3pm_encoder_decoder"):
         return "d3pm_encoder_decoder"
+    if experiment_name.startswith("baseline_cross_attention"):
         return "baseline_cross_attention"
+    if experiment_name.startswith("baseline_encoder_decoder"):
         return "baseline_encoder_decoder"
+    return CONFIG["model_type"]
+def infer_include_negative(experiment_name: str, root: str = "") -> bool:
+    if root == "ablation_results":
+        return False
+    if "_neg_True" in experiment_name:
         return True
+    if "_neg_False" in experiment_name:
         return False
     return CONFIG["data"]["include_negative_examples"]
+def build_runtime_cfg(ckpt_path: str):
+    experiment = os.path.basename(os.path.dirname(ckpt_path)) or "remote"
+    root = os.path.basename(os.path.dirname(os.path.dirname(ckpt_path))) or "remote"
+    cfg = copy.deepcopy(CONFIG)
+    cfg["model_type"] = infer_model_type(experiment, root=root)
+    cfg["data"]["include_negative_examples"] = infer_include_negative(experiment, root=root)
+    if root == "ablation_results" and experiment.startswith("T") and experiment[1:].isdigit():
+        t_val = int(experiment[1:])
+        cfg["model"]["diffusion_steps"] = t_val
+        cfg["inference"]["num_steps"] = t_val
+    device = _resolve_device(cfg)
+    return cfg, device, experiment
+def load_selected_model(checkpoint_label):
+    mapping = checkpoint_map()
+    if checkpoint_label not in mapping:
+        raise gr.Error("Selected checkpoint not found. Click refresh.")
+    ckpt_path = mapping[checkpoint_label]["path"]
+    cfg, device, experiment = build_runtime_cfg(ckpt_path)
+    model, cfg = load_model(ckpt_path, cfg, device)
+    src_tok, tgt_tok = _build_tokenizers(cfg)
+    bundle = {
+        "ckpt_path": ckpt_path,
+        "experiment": experiment,
+        "device": str(device),
+        "cfg": cfg,
+        "model": model,
+        "src_tok": src_tok,
+        "tgt_tok": tgt_tok,
+    }
+    model_info = {
+        "checkpoint": ckpt_path,
+        "experiment": experiment,
+        "model_type": cfg["model_type"],
+        "include_negatives": cfg["data"]["include_negative_examples"],
+        "device": str(device),
+        "max_seq_len": cfg["model"]["max_seq_len"],
+        "diffusion_steps": cfg["model"]["diffusion_steps"],
+        "inference_steps": cfg["inference"]["num_steps"],
+        "d_model": cfg["model"]["d_model"],
+        "n_layers": cfg["model"]["n_layers"],
+        "n_heads": cfg["model"]["n_heads"],
+    }
+    status = f"Loaded `{experiment}` on `{device}` (`{cfg['model_type']}`)"
+    suggested_out = os.path.join("analysis", "outputs_ui", experiment)
+    return bundle, status, model_info, cfg["inference"]["num_steps"], suggested_out
+def apply_preset(preset_name):
+    presets = {
+        "Manual": (0.70, 40, 1.20, 0.0),
+        "Literal": (0.60, 20, 1.25, 0.0),
+        "Balanced": (0.70, 40, 1.20, 0.0),
+        "Creative": (0.90, 80, 1.05, 0.2),
+    }
+    return presets.get(preset_name, presets["Balanced"])
+def clean_generated_text(text: str, max_consecutive: int = 2) -> str:
+    text = " ".join(text.split())
+    if not text:
+        return text
+    tokens = text.split()
+    cleaned = []
+    prev = None
+    run = 0
+    for tok in tokens:
+        if tok == prev:
+            run += 1
+        else:
+            prev = tok
+            run = 1
+        if run <= max_consecutive:
+            cleaned.append(tok)
+    out = " ".join(cleaned).replace(" ।", "।").replace(" ॥", "॥")
+    return " ".join(out.split())
+def save_generation(experiment, record):
+    ts = datetime.now().strftime("%Y%m%d")
+    path = os.path.join(RESULTS_DIR, f"{experiment}_ui_{ts}.json")
+    existing = []
+    if os.path.exists(path):
+        with open(path, "r", encoding="utf-8") as f:
+            existing = json.load(f)
+    existing.append(record)
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(existing, f, ensure_ascii=False, indent=2)
+    return path
+def generate_from_ui(
+    model_bundle,
+    input_text,
+    temperature,
+    top_k,
+    repetition_penalty,
+    diversity_penalty,
+    num_steps,
+    clean_output,
+):
+    if not model_bundle:
+        raise gr.Error("Load a model first.")
+    if not input_text.strip():
+        raise gr.Error("Enter input text first.")
+    cfg = copy.deepcopy(model_bundle["cfg"])
     cfg["inference"]["temperature"] = float(temperature)
     cfg["inference"]["top_k"] = int(top_k)
     cfg["inference"]["repetition_penalty"] = float(repetition_penalty)
     cfg["inference"]["diversity_penalty"] = float(diversity_penalty)
     cfg["inference"]["num_steps"] = int(num_steps)
+    src_tok = model_bundle["src_tok"]
+    tgt_tok = model_bundle["tgt_tok"]
+    device = torch.device(model_bundle["device"])
+    input_ids = torch.tensor([src_tok.encode(input_text.strip())], dtype=torch.long, device=device)
+    out = run_inference(model_bundle["model"], input_ids, cfg)
+    # Align decode with validation style: strip only special ids.
+    pad_id = 1
+    mask_id = cfg["diffusion"]["mask_token_id"]
+    decoded_ids = [x for x in out[0].tolist() if x not in (pad_id, mask_id)]
+    raw_output_text = tgt_tok.decode(decoded_ids).strip()
+    output_text = clean_generated_text(raw_output_text) if clean_output else raw_output_text
+    if not output_text:
+        output_text = "(empty output)"
+    record = {
+        "timestamp": datetime.now().isoformat(timespec="seconds"),
+        "experiment": model_bundle["experiment"],
+        "checkpoint": model_bundle["ckpt_path"],
+        "input_text": input_text,
+        "raw_output_text": raw_output_text,
+        "output_text": output_text,
+        "temperature": float(temperature),
+        "top_k": int(top_k),
+        "repetition_penalty": float(repetition_penalty),
+        "diversity_penalty": float(diversity_penalty),
+        "num_steps": int(num_steps),
+        "clean_output": bool(clean_output),
+    }
+    log_path = save_generation(model_bundle["experiment"], record)
+    status = f"Inference done. Saved: `{log_path}`"
+    return output_text, status, record
+def _run_analysis_cmd(task, ckpt_path, output_dir, input_text="dharmo rakṣati rakṣitaḥ", phase="analyze"):
+    os.makedirs(output_dir, exist_ok=True)
+    cmd = [
+        sys.executable,
+        "analysis/run_analysis.py",
+        "--task",
+        str(task),
+        "--checkpoint",
+        ckpt_path,
+        "--output_dir",
+        output_dir,
+    ]
+    if str(task) == "2" or str(task) == "all":
+        cmd.extend(["--input", input_text])
+    if str(task) == "4":
+        cmd.extend(["--phase", phase])
+    env = os.environ.copy()
+    env.setdefault("HF_HOME", "/tmp/hf_home")
+    env.setdefault("HF_DATASETS_CACHE", "/tmp/hf_datasets")
+    env.setdefault("HF_HUB_CACHE", "/tmp/hf_hub")
+    proc = subprocess.run(cmd, capture_output=True, text=True, env=env)
+    log = f"$ {' '.join(cmd)}\n\n{proc.stdout}\n{proc.stderr}"
+    return proc.returncode, log
+def run_single_task(model_bundle, task, output_dir, input_text, task4_phase):
+    if not model_bundle:
+        raise gr.Error("Load a model first.")
+    code, log = _run_analysis_cmd(task, model_bundle["ckpt_path"], output_dir, input_text, task4_phase)
+    status = f"Task {task} {'completed' if code == 0 else 'failed'} (exit={code})."
+    return status, log
+def run_all_tasks(model_bundle, output_dir, input_text, task4_phase):
+    if not model_bundle:
+        raise gr.Error("Load a model first.")
+    logs = []
+    failures = 0
+    for task in ["1", "2", "3", "4", "5"]:
+        code, log = _run_analysis_cmd(task, model_bundle["ckpt_path"], output_dir, input_text, task4_phase)
+        logs.append(f"\n\n{'='*22} TASK {task} {'='*22}\n{log}")
+        if code != 0:
+            failures += 1
+    status = f"Run-all finished with {failures} failed task(s)." if failures else "All 5 tasks completed."
+    return status, "".join(logs)
+def _read_text(path):
+    if not os.path.exists(path):
+        return "Not found."
+    with open(path, "r", encoding="utf-8", errors="ignore") as f:
+        return f.read()
+def _img_or_none(path):
+    return path if os.path.exists(path) else None
+def refresh_task_outputs(output_dir):
+    task1_txt = _read_text(os.path.join(output_dir, "task1_kv_cache.txt"))
+    task2_txt = _read_text(os.path.join(output_dir, "task2_report.txt"))
+    task3_txt = _read_text(os.path.join(output_dir, "task3_report.txt"))
+    task5_txt = _read_text(os.path.join(output_dir, "task5_report.txt"))
+    task2_drift = _img_or_none(os.path.join(output_dir, "task2_semantic_drift.png"))
+    task2_attn = _img_or_none(os.path.join(output_dir, "task2_attn_t0.png"))
+    task3_space = _img_or_none(os.path.join(output_dir, "task3_concept_space.png"))
+    task4_plot = _img_or_none(os.path.join(output_dir, "task4_ablation_3d.png"))
+    return task1_txt, task2_txt, task2_drift, task2_attn, task3_txt, task3_space, task5_txt, task4_plot
+CUSTOM_CSS = """
+:root {
+  --bg1: #f5fbff;
+  --bg2: #f2f7ef;
+  --card: #ffffff;
+  --line: #d9e6f2;
+  --ink: #163048;
+}
+.gradio-container {
+  background: linear-gradient(130deg, var(--bg1), var(--bg2));
+  color: var(--ink);
+}
+#hero {
+  background: radial-gradient(110% 130% at 0% 0%, #d7ebff 0%, #ecf6ff 55%, #f8fbff 100%);
+  border: 1px solid #cfe0f1;
+  border-radius: 16px;
+  padding: 18px 20px;
+}
+.panel {
+  background: var(--card);
+  border: 1px solid var(--line);
+  border-radius: 14px;
+}
+"""
+with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
     model_state = gr.State(None)
     gr.Markdown(
         """
+<div id="hero">
+  <h1 style="margin:0;">Sanskrit Diffusion Client Demo</h1>
+  <p style="margin:.5rem 0 0 0;">
+    Select any trained model, run all 5 analysis tasks or individual tasks, then test inference with user-controlled parameters.
+  </p>
+</div>
 """
     )
     with gr.Row():
+        with gr.Column(scale=2, elem_classes=["panel"]):
+            checkpoint_dropdown = gr.Dropdown(
+                label="Model Checkpoint",
+                choices=list(checkpoint_map().keys()),
+                value=default_checkpoint_label(),
+                interactive=True,
+            )
+        with gr.Column(scale=1, elem_classes=["panel"]):
+            refresh_btn = gr.Button("Refresh Models")
+            load_btn = gr.Button("Load Selected Model", variant="primary")
+    load_status = gr.Markdown("Select a model and load.")
+    model_info = gr.JSON(label="Loaded Model Details")
+    with gr.Tabs():
+        with gr.Tab("1) Task Runner"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    analysis_output_dir = gr.Textbox(
+                        label="Analysis Output Directory",
+                        value=DEFAULT_ANALYSIS_OUT,
+                    )
+                    analysis_input = gr.Textbox(
+                        label="Task 2 Input Text",
+                        value="dharmo rakṣati rakṣitaḥ",
+                        lines=2,
+                    )
+                with gr.Column(scale=1):
+                    task4_phase = gr.Dropdown(
+                        choices=["analyze", "generate_configs"],
+                        value="analyze",
+                        label="Task 4 Phase",
+                    )
+                    run_all_btn = gr.Button("Run All 5 Tasks", variant="primary")
+            with gr.Row():
+                task_choice = gr.Dropdown(
+                    choices=["1", "2", "3", "4", "5"],
+                    value="1",
+                    label="Single Task",
+                )
+                run_single_btn = gr.Button("Run Selected Task")
+                refresh_outputs_btn = gr.Button("Refresh Output Viewer")
+            task_run_status = gr.Markdown("")
+            task_run_log = gr.Textbox(label="Task Execution Log", lines=18, interactive=False)
+            with gr.Accordion("Task Outputs Viewer", open=True):
+                task1_box = gr.Textbox(label="Task 1 Report", lines=10, interactive=False)
+                task2_box = gr.Textbox(label="Task 2 Report", lines=10, interactive=False)
+                with gr.Row():
+                    task2_drift_img = gr.Image(label="Task2 Drift", type="filepath")
+                    task2_attn_img = gr.Image(label="Task2 Attention", type="filepath")
+                task3_box = gr.Textbox(label="Task 3 Report", lines=10, interactive=False)
+                task3_img = gr.Image(label="Task3 Concept Space", type="filepath")
+                task5_box = gr.Textbox(label="Task 5 Report", lines=10, interactive=False)
+                task4_img = gr.Image(label="Task4 3D Ablation Plot", type="filepath")
+        with gr.Tab("2) Inference Playground"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    input_text = gr.Textbox(
+                        label="Input (Roman / IAST)",
+                        lines=4,
+                        value="dharmo rakṣati rakṣitaḥ",
+                    )
+                    output_text = gr.Textbox(
+                        label="Output (Devanagari)",
+                        lines=7,
+                        interactive=False,
+                    )
+                    run_status = gr.Markdown("")
+                    run_record = gr.JSON(label="Inference Metadata")
+                with gr.Column(scale=1, elem_classes=["panel"]):
+                    preset = gr.Radio(["Manual", "Literal", "Balanced", "Creative"], value="Balanced", label="Preset")
+                    temperature = gr.Slider(0.4, 1.2, value=0.70, step=0.05, label="Temperature")
+                    top_k = gr.Slider(5, 100, value=40, step=1, label="Top-K")
+                    repetition_penalty = gr.Slider(1.0, 3.0, value=1.20, step=0.05, label="Repetition Penalty")
+                    diversity_penalty = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Diversity Penalty")
+                    num_steps = gr.Slider(1, 128, value=64, step=1, label="Inference Steps")
+                    clean_output = gr.Checkbox(value=True, label="Clean Output")
+                    generate_btn = gr.Button("Generate", variant="primary")
+            gr.Examples(
+                examples=[
+                    ["dharmo rakṣati rakṣitaḥ"],
+                    ["satyameva jayate"],
+                    ["yadā mano nivarteta viṣayebhyaḥ svabhāvataḥ"],
+                ],
+                inputs=[input_text],
+            )
+    def refresh_checkpoints():
+        choices = list(checkpoint_map().keys())
+        value = default_checkpoint_label() if choices else None
+        return gr.Dropdown(choices=choices, value=value)
+    refresh_btn.click(fn=refresh_checkpoints, outputs=[checkpoint_dropdown])
+    load_btn.click(
+        fn=load_selected_model,
+        inputs=[checkpoint_dropdown],
+        outputs=[model_state, load_status, model_info, num_steps, analysis_output_dir],
+    )
+    preset.change(
+        fn=apply_preset,
+        inputs=[preset],
+        outputs=[temperature, top_k, repetition_penalty, diversity_penalty],
+    )
     generate_btn.click(
+        fn=generate_from_ui,
         inputs=[
+            model_state,
+            input_text,
+            temperature,
+            top_k,
+            repetition_penalty,
+            diversity_penalty,
+            num_steps,
+            clean_output,
         ],
+        outputs=[output_text, run_status, run_record],
     )
+    input_text.submit(
+        fn=generate_from_ui,
         inputs=[
+            model_state,
+            input_text,
+            temperature,
+            top_k,
+            repetition_penalty,
+            diversity_penalty,
+            num_steps,
+            clean_output,
+        ],
+        outputs=[output_text, run_status, run_record],
+    )
+    run_single_btn.click(
+        fn=run_single_task,
+        inputs=[model_state, task_choice, analysis_output_dir, analysis_input, task4_phase],
+        outputs=[task_run_status, task_run_log],
+    )
+    run_all_btn.click(
+        fn=run_all_tasks,
+        inputs=[model_state, analysis_output_dir, analysis_input, task4_phase],
+        outputs=[task_run_status, task_run_log],
+    )
+    refresh_outputs_btn.click(
+        fn=refresh_task_outputs,
+        inputs=[analysis_output_dir],
+        outputs=[
+            task1_box,
+            task2_box,
+            task2_drift_img,
+            task2_attn_img,
+            task3_box,
+            task3_img,
+            task5_box,
+            task4_img,
+        ],
+    )
+    demo.load(
+        fn=refresh_task_outputs,
+        inputs=[analysis_output_dir],
+        outputs=[
+            task1_box,
+            task2_box,
+            task2_drift_img,
+            task2_attn_img,
+            task3_box,
+            task3_img,
+            task5_box,
+            task4_img,
         ],
     )
 if __name__ == "__main__":
+    port = int(os.environ["GRADIO_SERVER_PORT"]) if "GRADIO_SERVER_PORT" in os.environ else None
     demo.launch(server_name="0.0.0.0", server_port=port, share=False)

data/__init__.py ADDED Viewed

File without changes

data/dataset.py ADDED Viewed

	@@ -0,0 +1,152 @@

+"""
+dataset.py  — Cross-Script Translation Fix
+==========================================
+INPUT  : quote_text       (Roman/IAST transliteration of Sanskrit)
+TARGET : quote_devanagari (Devanagari script)
+This is the CORRECT task: the model learns to transliterate / translate
+Roman Sanskrit → Devanagari, which is a meaningful, learnable mapping
+(far better than devanagari→devanagari reconstruction which teaches nothing).
+KEY CHANGES from original:
+  1. _input_field  = 'quote_text'        (was 'quote_devanagari')
+  2. _target_field = 'quote_devanagari'  (unchanged)
+  3. Separate source/target tokenizers — Roman and Devanagari have
+     completely different character sets; a shared BPE vocab forces the
+     model to learn both scripts in one embedding table, which wastes
+     capacity and confuses the attention mechanism.
+  4. Negative example generation fixed — reversal now operates on
+     DEVANAGARI target only (not accidentally on Roman source).
+  5. curriculum_sort uses target length (Devanagari) for difficulty proxy.
+"""
+from datasets import load_dataset
+from torch.utils.data import Dataset
+import torch
+import torch.nn.functional as F
+import random
+class OptimizedSanskritDataset(Dataset):
+    def __init__(self, split='train', tokenizer=None, max_len=80, cfg=None,
+                 src_tokenizer=None, tgt_tokenizer=None):
+        """
+        Args:
+            tokenizer     : shared tokenizer (legacy — used if src/tgt not provided)
+            src_tokenizer : tokenizer for quote_text  (Roman script)
+            tgt_tokenizer : tokenizer for quote_devanagari (Devanagari script)
+                            If None, falls back to shared `tokenizer`.
+        """
+        from config import CONFIG
+        self.cfg = cfg or CONFIG
+        self.max_len = max_len
+        self.pad_id  = 1
+        self.mask_id = self.cfg['diffusion']['mask_token_id']
+        self.include_negatives = self.cfg['data']['include_negative_examples']
+        # ── Tokenizer setup ───────────────────────────────────────────
+        # Support both legacy (shared) and new (separate src/tgt) tokenizers
+        self.src_tokenizer = src_tokenizer or tokenizer
+        self.tgt_tokenizer = tgt_tokenizer or tokenizer
+        if self.src_tokenizer is None:
+            raise ValueError("Provide at least one tokenizer.")
+        print(f"📥 Loading '{split}' split …")
+        raw = load_dataset("paws/sanskrit-verses-gretil", split=split)
+        cols = raw.column_names
+        # ── Field selection ───────────────────────────────────────────
+        if 'quote_text' in cols and 'quote_devanagari' in cols:
+            # CORRECT setup: Roman input → Devanagari output
+            self._input_field  = 'quote_text'
+            self._target_field = 'quote_devanagari'
+            print("   Format: quote_text (Roman) → quote_devanagari (Devanagari) ✓")
+        elif 'sentence1' in cols and 'sentence2' in cols:
+            # PAWS paraphrase pairs fallback
+            self._input_field  = 'sentence1'
+            self._target_field = 'sentence2'
+            print("   Format: PAWS sentence pairs ✓")
+        else:
+            # Last resort: same field both sides
+            self._input_field  = 'quote_devanagari'
+            self._target_field = 'quote_devanagari'
+            print("   ⚠️  Format: Devanagari→Devanagari (suboptimal — no quote_text found)")
+        # ── Filter empty rows ─────────────────────────────────────────
+        # Some rows have empty quote_text — skip them
+        raw = raw.filter(
+            lambda ex: (
+                bool(ex[self._input_field].strip()) and
+                bool(ex[self._target_field].strip())
+            )
+        )
+        print(f"   After empty-filter: {len(raw)} samples")
+        self.dataset = raw
+        if split == 'train':
+            self.dataset = self._curriculum_sort()
+        print(f"✅ {len(self.dataset)} samples loaded.")
+    # ── Encoding ──────────────────────────────────────────────────────
+    def _encode_src(self, text):
+        """Encode source (Roman) text."""
+        ids = self.src_tokenizer.encode(text)[:self.max_len]
+        t   = torch.tensor(ids, dtype=torch.long)
+        t   = F.pad(t, (0, max(0, self.max_len - len(t))), value=self.pad_id)
+        return t
+    def _encode_tgt(self, text):
+        """Encode target (Devanagari) text."""
+        ids = self.tgt_tokenizer.encode(text)[:self.max_len]
+        t   = torch.tensor(ids, dtype=torch.long)
+        t   = F.pad(t, (0, max(0, self.max_len - len(t))), value=self.pad_id)
+        return t
+    # ── Curriculum ───��────────────────────────────────────────────────
+    def _curriculum_sort(self):
+        """Short, common Devanagari targets first → long, rare targets last."""
+        scores = []
+        for s in self.dataset:
+            text         = s[self._target_field]
+            length       = len(text.split())
+            rarity_score = len(set(text)) / max(1, len(text))
+            scores.append(length * (1 - rarity_score))
+        order = sorted(range(len(self.dataset)), key=lambda i: scores[i])
+        return self.dataset.select(order)
+    # ── Item ──────────────────────────────────────────────────────────
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        sample = self.dataset[idx]
+        src_text = sample[self._input_field].strip()
+        tgt_text = sample[self._target_field].strip()
+        input_ids  = self._encode_src(src_text)   # Roman encoded with src_tokenizer
+        target_ids = self._encode_tgt(tgt_text)   # Devanagari encoded with tgt_tokenizer
+        out = {
+            'input_ids':   input_ids,
+            'target_ids':  target_ids,
+            'input_text':  src_text,
+            'target_text': tgt_text,
+        }
+        if self.include_negatives:
+            neg_ids = target_ids.clone()
+            # Reverse a random chunk of the DEVANAGARI target
+            non_pad = (neg_ids != self.pad_id).sum().item()
+            if non_pad > 4:
+                i1, i2 = sorted(random.sample(range(non_pad), 2))
+                neg_ids[i1:i2] = torch.flip(neg_ids[i1:i2], dims=[0])
+            out['negative_target_ids'] = neg_ids
+        return out

requirements.txt CHANGED Viewed

@@ -4,3 +4,9 @@ numpy>=1.24
 tqdm>=4.66
 huggingface_hub==0.25.2
 tokenizers>=0.15

 tqdm>=4.66
 huggingface_hub==0.25.2
 tokenizers>=0.15
+datasets>=2.20
+scikit-learn>=1.4
+matplotlib>=3.8
+bert-score>=0.3.13
+sentence-transformers>=3.0
+nltk>=3.8