diff --git a/.gitattributes b/.gitattributes index 7f099627d2fb806db6a45fb661320188aa731e8f..a101dffce50756b6934b5404ad4079f3d41927f9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text analysis_outputs/T16/task3_concept_space.png filter=lfs diff=lfs merge=lfs -text analysis_outputs/T4/task3_concept_space.png filter=lfs diff=lfs merge=lfs -text analysis_outputs/T8/task3_concept_space.png filter=lfs diff=lfs merge=lfs -text +analysis_outputs/outputs_all_models_20260325/T32/task5_quality_diversity_tradeoff.png filter=lfs diff=lfs merge=lfs -text +analysis_outputs/outputs_all_models_20260325/T64/task5_quality_diversity_tradeoff.png filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 6b7947a98d6a15bbb3f64062cde1462b88a9111c..8239892d54da6e0e14d107981f970968d3b0c8f1 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,13 @@ Set these Space variables in **Settings → Variables and secrets**: - `HF_CHECKPOINT_REPO` = `/sanskrit-d3pm` - `HF_CHECKPOINT_FILE` = `best_model.pt` - `HF_CHECKPOINT_LABEL` = `main-model` (optional) +- `HF_DEFAULT_MODEL_TYPE` = `d3pm_cross_attention` or `d3pm_encoder_decoder` +- `HF_DEFAULT_INCLUDE_NEG` = `true` or `false` +- `HF_DEFAULT_NUM_STEPS` = checkpoint diffusion steps, for example `4`, `8`, `16` The app will download checkpoint from your model repo and load it at runtime. +If the model repo contains `model_settings.json`, the Space will use it +automatically and these variables become optional overrides. 
### Optional MLflow Tracking in Space diff --git a/analysis/quality_classifier.py b/analysis/quality_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..1562577c4f54796f15ddfdef2dc97389aea0ea02 --- /dev/null +++ b/analysis/quality_classifier.py @@ -0,0 +1,885 @@ +# """ +# analysis/quality_classifier.py +# ================================ +# Task 5: Classifier-Free Guidance for Paraphrase Quality Control +# +# Two steps — only Step 2 requires training a SMALL model (not the main D3PM): +# +# STEP 1 — Collect training data (no training): +# Run existing model on val set, record (hidden_state, CER) pairs. +# Hidden states come from model.model._last_hidden after forward_cached(). +# CER score = quality label (lower CER = higher quality). +# +# STEP 2 — Train quality classifier: +# Small 2-layer MLP: d_model → 64 → 1 +# Input: pooled decoder hidden state [B, d_model] +# Output: predicted quality score in [0, 1] (1 = high quality) +# Loss: MSE against normalized CER labels +# Training time: ~5-10 minutes on CPU for 10k examples +# +# STEP 3 — Guided inference (no retraining): +# At each diffusion step, use classifier gradient to shift logits: +# guided_logits = logits + λ * ∂(quality_score)/∂(logits) +# Higher λ → model biased toward high-quality outputs +# λ=0 → standard generation (no guidance) +# +# Key: main D3PM model is FROZEN throughout. Only the 10k-param classifier trains. +# """ +# +# import torch +# import torch.nn as nn +# import torch.nn.functional as F +# import numpy as np +# import os +# import json +# from typing import List, Dict, Optional, Tuple +# +# +# # ── Quality classifier architecture ────────────────────────────────── +# +# class QualityClassifier(nn.Module): +# """ +# Lightweight MLP that predicts transliteration quality from decoder +# hidden states. 
+# +# Architecture: +# d_model → 128 → 64 → 1 → Sigmoid +# +# Input: mean-pooled decoder hidden state [B, d_model] +# Output: quality score [B, 1] ∈ [0, 1] (1 = high quality) +# +# ~10k parameters. Trains in minutes on CPU. +# """ +# def __init__(self, d_model: int): +# super().__init__() +# self.net = nn.Sequential( +# nn.Linear(d_model, 128), +# nn.ReLU(), +# nn.Dropout(0.1), +# nn.Linear(128, 64), +# nn.ReLU(), +# nn.Linear(64, 1), +# nn.Sigmoid(), +# ) +# self.d_model = d_model +# +# def forward(self, hidden: torch.Tensor) -> torch.Tensor: +# """ +# Args: +# hidden : [B, tgt_len, d_model] OR [B, d_model] (already pooled) +# +# Returns: +# score : [B, 1] quality score in [0, 1] +# """ +# if hidden.dim() == 3: +# # Pool over sequence length +# hidden = hidden.mean(dim=1) # [B, d_model] +# return self.net(hidden) # [B, 1] +# +# +# # ── Training data collection ────────────────────────────────────────── +# +# @torch.no_grad() +# def collect_quality_data( +# model, +# src_list: List[torch.Tensor], +# ref_list: List[str], +# tgt_tokenizer, +# t_capture: int = 0, +# temperature: float = 0.8, +# top_k: int = 40, +# max_samples: int = 5000, +# ) -> Tuple[np.ndarray, np.ndarray]: +# """ +# Collect (hidden_state, quality_score) pairs for classifier training. +# +# For each sample: +# 1. Run generate_cached() on src +# 2. Capture decoder hidden state at t=t_capture +# 3. Compute CER between output and reference +# 4. 
Quality = 1 - CER (normalize to [0,1]) +# +# Args: +# model : SanskritModel +# src_list : list of [1, src_len] tensors +# ref_list : list of reference Devanagari strings +# tgt_tokenizer : SanskritTargetTokenizer +# t_capture : which step to capture hidden states (0 = final) +# max_samples : cap number of training examples +# +# Returns: +# hidden_matrix : np.ndarray [N, d_model] +# quality_scores: np.ndarray [N] values in [0, 1] +# """ +# inner = model.model +# T = inner.scheduler.num_timesteps +# device = next(inner.parameters()).device +# +# hidden_list = [] +# quality_list = [] +# n = min(len(src_list), max_samples) +# +# def cer(pred, ref): +# if not ref: +# return 1.0 +# def ed(s1, s2): +# m, n = len(s1), len(s2) +# dp = list(range(n + 1)) +# for i in range(1, m + 1): +# prev, dp[0] = dp[0], i +# for j in range(1, n + 1): +# temp = dp[j] +# dp[j] = prev if s1[i-1] == s2[j-1] else 1 + min(prev, dp[j], dp[j-1]) +# prev = temp +# return dp[n] +# return ed(pred, ref) / max(len(ref), 1) +# +# print(f"Collecting quality data from {n} examples...") +# for i, (src, ref) in enumerate(zip(src_list[:n], ref_list[:n])): +# if i % 200 == 0: +# print(f" {i}/{n}") +# +# if src.dim() == 1: +# src = src.unsqueeze(0) +# src = src.to(device) +# +# B = src.shape[0] +# tgt_len = inner.max_seq_len +# mask_id = inner.mask_token_id +# +# memory, src_pad_mask = inner.encode_source(src) +# x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device) +# hint = None +# h_cap = None +# +# for t_val in range(T - 1, -1, -1): +# t = torch.full((B,), t_val, dtype=torch.long, device=device) +# is_last = (t_val == 0) +# +# logits, _ = inner.forward_cached( +# memory, src_pad_mask, x0_est, t, +# x0_hint=hint, inference_mode=True, +# ) +# +# if t_val == t_capture and hasattr(inner, '_last_hidden'): +# h_cap = inner._last_hidden[0].mean(dim=0).detach().cpu() # [d_model] +# +# logits = logits / max(temperature, 1e-8) +# if top_k > 0: +# V = logits.shape[-1] +# if top_k < V: +# vals, 
_ = torch.topk(logits, top_k, dim=-1) +# logits = logits.masked_fill(logits < vals[..., -1:], float('-inf')) +# +# probs = F.softmax(logits, dim=-1) +# x0_est = torch.argmax(probs, dim=-1) if is_last else _sample(probs) +# hint = x0_est +# +# if h_cap is None: +# continue +# +# ids = [x for x in x0_est[0].tolist() if x > 4] +# pred = tgt_tokenizer.decode(ids).strip() +# q = max(0.0, 1.0 - cer(pred, ref)) # quality = 1 - CER +# +# hidden_list.append(h_cap.numpy()) +# quality_list.append(q) +# +# print(f"Collected {len(hidden_list)} quality examples.") +# print(f"Quality stats: mean={np.mean(quality_list):.3f} " +# f"min={np.min(quality_list):.3f} max={np.max(quality_list):.3f}") +# +# return np.stack(hidden_list), np.array(quality_list, dtype=np.float32) +# +# +# def _sample(probs): +# B, L, V = probs.shape +# flat = probs.view(B * L, V).clamp(min=1e-9) +# flat = flat / flat.sum(dim=-1, keepdim=True) +# return torch.multinomial(flat, 1).squeeze(-1).view(B, L) +# +# +# # ── Training ────────────────────────────────────────────────────────── +# +# def train_quality_classifier( +# hidden_matrix: np.ndarray, +# quality_scores: np.ndarray, +# d_model: int, +# epochs: int = 30, +# batch_size: int = 64, +# lr: float = 1e-3, +# val_frac: float = 0.1, +# save_path: Optional[str] = None, +# ) -> QualityClassifier: +# """ +# Train QualityClassifier on collected (hidden, quality) pairs. 
+# +# Args: +# hidden_matrix : [N, d_model] from collect_quality_data() +# quality_scores : [N] quality labels in [0, 1] +# d_model : hidden dimension +# epochs : training epochs +# save_path : if given, save trained classifier weights here +# +# Returns: +# trained QualityClassifier +# """ +# device = torch.device("cpu") # classifier is tiny, CPU is fine +# +# X = torch.tensor(hidden_matrix, dtype=torch.float32) +# y = torch.tensor(quality_scores, dtype=torch.float32).unsqueeze(-1) +# +# N = len(X) +# n_val = max(1, int(N * val_frac)) +# idx = torch.randperm(N) +# val_idx = idx[:n_val] +# train_idx = idx[n_val:] +# +# X_train, y_train = X[train_idx], y[train_idx] +# X_val, y_val = X[val_idx], y[val_idx] +# +# clf = QualityClassifier(d_model).to(device) +# optimizer = torch.optim.Adam(clf.parameters(), lr=lr) +# +# print(f"\nTraining QualityClassifier: {sum(p.numel() for p in clf.parameters())} params") +# print(f"Train: {len(X_train)} Val: {len(X_val)}") +# +# best_val_loss = float('inf') +# best_state = None +# +# for epoch in range(epochs): +# clf.train() +# perm = torch.randperm(len(X_train)) +# train_loss = 0.0 +# n_batches = 0 +# +# for start in range(0, len(X_train), batch_size): +# batch_idx = perm[start:start + batch_size] +# xb, yb = X_train[batch_idx], y_train[batch_idx] +# pred = clf(xb) +# loss = F.mse_loss(pred, yb) +# optimizer.zero_grad() +# loss.backward() +# optimizer.step() +# train_loss += loss.item() +# n_batches += 1 +# +# clf.eval() +# with torch.no_grad(): +# val_pred = clf(X_val) +# val_loss = F.mse_loss(val_pred, y_val).item() +# +# if epoch % 5 == 0 or epoch == epochs - 1: +# print(f" Ep {epoch+1:3d} train={train_loss/n_batches:.4f} val={val_loss:.4f}") +# +# if val_loss < best_val_loss: +# best_val_loss = val_loss +# best_state = {k: v.clone() for k, v in clf.state_dict().items()} +# +# if best_state: +# clf.load_state_dict(best_state) +# print(f" Best val loss: {best_val_loss:.4f}") +# +# if save_path: +# 
os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True) +# torch.save(clf.state_dict(), save_path) +# print(f" Classifier saved: {save_path}") +# +# return clf +# +# +# # ── Guided inference ────────────────────────────────────────────────── +# +# def generate_guided( +# model, +# src: torch.Tensor, +# classifier: QualityClassifier, +# guidance_scale: float = 1.0, +# temperature: float = 0.8, +# top_k: int = 40, +# ) -> torch.Tensor: +# """ +# Classifier-guided generation. +# +# At each diffusion step: +# 1. Run forward_cached() → logits, hidden states +# 2. Compute classifier gradient: ∂(quality_score) / ∂(hidden) +# 3. Project gradient back to logit space (approximate) +# 4. guided_logits = logits + λ * gradient_signal +# 5. Sample from guided_logits +# +# guidance_scale λ: +# 0.0 → no guidance (standard generation) +# 0.5 → weak guidance +# 1.0 → moderate guidance (recommended starting point) +# 2.0 → strong guidance (may reduce diversity) +# 3.0 → very strong (may collapse to repetitive output) +# +# Args: +# model : SanskritModel (frozen) +# src : [1, src_len] IAST token ids +# classifier : trained QualityClassifier +# guidance_scale : λ — guidance strength +# +# Returns: +# x0_est : [1, tgt_len] generated token ids +# """ +# inner = model.model +# T = inner.scheduler.num_timesteps +# device = next(inner.parameters()).device +# clf_device = next(classifier.parameters()).device +# +# if src.dim() == 1: +# src = src.unsqueeze(0) +# src = src.to(device) +# +# B = src.shape[0] +# tgt_len = inner.max_seq_len +# mask_id = inner.mask_token_id +# +# memory, src_pad_mask = inner.encode_source(src) +# x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device) +# hint = None +# +# inner.eval() +# classifier.eval() +# +# for t_val in range(T - 1, -1, -1): +# t = torch.full((B,), t_val, dtype=torch.long, device=device) +# is_last = (t_val == 0) +# +# if guidance_scale > 0.0: +# # Need gradients for classifier guidance +# with torch.enable_grad(): +# 
# Run forward_cached and get hidden states +# PAD = 1 +# if t_val > 0: +# _, x_t_ids = inner.forward_process.q_sample(x0_est, t) +# else: +# x_t_ids = x0_est +# +# x = inner.tgt_embed(x_t_ids) +# t_norm = t.float() / T +# t_emb = inner.time_mlp(t_norm.unsqueeze(-1)) +# x = x + t_emb.unsqueeze(1) +# +# if hint is not None: +# hint_emb = inner.tgt_embed(hint) +# gate = inner.hint_gate(x) +# x = x + gate * hint_emb +# +# for block in inner.decoder_blocks: +# x = block(x, memory, tgt_pad_mask=None, src_pad_mask=src_pad_mask) +# +# # hidden: [B, tgt_len, d_model] — detach from graph for clf +# hidden = x.detach().requires_grad_(True).to(clf_device) +# +# # Classifier quality score +# quality = classifier(hidden) # [B, 1] +# quality.sum().backward() +# +# # Gradient of quality w.r.t. hidden: [B, tgt_len, d_model] +# grad = hidden.grad.to(device) # [B, tgt_len, d_model] +# +# # Project gradient to logit space via output head weight +# # logit_grad ≈ grad @ head.weight [B, tgt_len, tgt_vocab] +# logit_grad = grad @ inner.head.weight.T +# +# # Compute standard logits (no gradient needed) +# with torch.no_grad(): +# logits = inner.head(x) +# +# # Apply guidance +# logits = logits + guidance_scale * logit_grad +# +# else: +# with torch.no_grad(): +# logits, _ = inner.forward_cached( +# memory, src_pad_mask, x0_est, t, +# x0_hint=hint, inference_mode=True, +# ) +# +# with torch.no_grad(): +# logits = logits / max(temperature, 1e-8) +# if top_k > 0: +# V = logits.shape[-1] +# if top_k < V: +# vals, _ = torch.topk(logits, top_k, dim=-1) +# logits = logits.masked_fill(logits < vals[..., -1:], float('-inf')) +# +# probs = F.softmax(logits, dim=-1) +# x0_est = torch.argmax(probs, dim=-1) if is_last else _sample_no_grad(probs) +# hint = x0_est +# +# return x0_est +# +# +# def _sample_no_grad(probs): +# B, L, V = probs.shape +# flat = probs.view(B * L, V).clamp(min=1e-9) +# flat = flat / flat.sum(dim=-1, keepdim=True) +# return torch.multinomial(flat, 1).squeeze(-1).view(B, L) +# +# 
+# # ── Guidance scale sweep ────────────────────────────────────────────── +# +# def sweep_guidance_scales( +# model, +# classifier: QualityClassifier, +# src_list: List[torch.Tensor], +# ref_list: List[str], +# tgt_tokenizer, +# scales: List[float] = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0], +# n_samples: int = 50, +# device: torch.device = None, +# output_dir: str = "analysis/outputs", +# ) -> Dict: +# """ +# Evaluate CER at each guidance scale. +# Produces quality-diversity tradeoff plot. +# """ +# def cer(pred, ref): +# if not ref: +# return 1.0 +# def ed(s1, s2): +# m, n = len(s1), len(s2) +# dp = list(range(n + 1)) +# for i in range(1, m + 1): +# prev, dp[0] = dp[0], i +# for j in range(1, n + 1): +# temp = dp[j] +# dp[j] = prev if s1[i-1] == s2[j-1] else 1 + min(prev, dp[j], dp[j-1]) +# prev = temp +# return dp[n] +# return ed(pred, ref) / max(len(ref), 1) +# +# device = device or next(model.parameters()).device +# results = {} +# n = min(n_samples, len(src_list)) +# +# print("\nGuidance scale sweep...") +# for scale in scales: +# cer_list = [] +# output_set = [] +# for src, ref in zip(src_list[:n], ref_list[:n]): +# if src.dim() == 1: +# src = src.unsqueeze(0) +# out = generate_guided(model, src.to(device), classifier, +# guidance_scale=scale) +# ids = [x for x in out[0].tolist() if x > 4] +# pred = tgt_tokenizer.decode(ids).strip() +# cer_list.append(cer(pred, ref)) +# output_set.append(pred) +# +# mean_cer = float(np.mean(cer_list)) +# +# # Self-diversity: unique outputs / total (proxy for diversity) +# unique_frac = len(set(output_set)) / max(len(output_set), 1) +# +# results[scale] = {"mean_cer": mean_cer, "diversity": unique_frac} +# print(f" λ={scale:.1f} CER={mean_cer:.4f} diversity={unique_frac:.3f}") +# +# # Plot +# os.makedirs(output_dir, exist_ok=True) +# try: +# import matplotlib.pyplot as plt +# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) +# +# sc_list = sorted(results.keys()) +# cers = [results[s]["mean_cer"] for s in sc_list] +# diversities 
= [results[s]["diversity"] for s in sc_list] +# +# ax1.plot(sc_list, cers, 'o-', color='coral', linewidth=1.8, markersize=7) +# ax1.set_xlabel("Guidance scale λ", fontsize=10) +# ax1.set_ylabel("CER (↓ better)", fontsize=10) +# ax1.set_title("Quality vs guidance scale", fontsize=10) +# +# ax2.plot(sc_list, diversities, 'o-', color='steelblue', linewidth=1.8, markersize=7) +# ax2.set_xlabel("Guidance scale λ", fontsize=10) +# ax2.set_ylabel("Output diversity (unique fraction)", fontsize=10) +# ax2.set_title("Diversity vs guidance scale", fontsize=10) +# +# plt.suptitle("Quality-Diversity Tradeoff (Guidance Scale Sweep)", fontsize=11) +# plt.tight_layout() +# path = os.path.join(output_dir, "guidance_scale_sweep.png") +# plt.savefig(path, dpi=150, bbox_inches='tight') +# plt.close() +# print(f" Saved: {path}") +# except ImportError: +# pass +# +# with open(os.path.join(output_dir, "guidance_results.json"), "w") as f: +# json.dump({str(k): v for k, v in results.items()}, f, indent=2) +# +# return results +import os +import json +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from typing import List, Dict +from itertools import combinations + + +class QualityClassifier(nn.Module): + def __init__(self, d_model: int): + super().__init__() + self.net = nn.Sequential( + nn.Linear(d_model, 128), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(128, 64), + nn.ReLU(), + nn.Linear(64, 1), + nn.Sigmoid(), + ) + + def forward(self, hidden): + if hidden.dim() == 3: + hidden = hidden.mean(dim=1) + return self.net(hidden) + + +def _cer(pred: str, ref: str) -> float: + m, n = len(pred), len(ref) + if m == 0 and n == 0: + return 0.0 + dp = list(range(n + 1)) + for i in range(1, m + 1): + prev, dp[0] = dp[0], i + for j in range(1, n + 1): + tmp = dp[j] + dp[j] = prev if pred[i - 1] == ref[j - 1] else 1 + min(prev, dp[j], dp[j - 1]) + prev = tmp + return float(dp[n]) / max(1, m, n) + + +def _sample(probs: torch.Tensor) -> torch.Tensor: + B, L, V = 
probs.shape + flat = probs.reshape(B * L, V).clamp(min=1e-9) + flat = flat / flat.sum(dim=-1, keepdim=True) + return torch.multinomial(flat, 1).squeeze(-1).reshape(B, L) + + +@torch.no_grad() +def _decode_pred(tgt_tokenizer, out_ids: torch.Tensor) -> str: + ids = [x for x in out_ids[0].tolist() if x > 4] + return tgt_tokenizer.decode(ids).strip() + + +def _tokenize_ws(text: str) -> list[str]: + return [t for t in text.split() if t] + + +def _distinct_n(outputs: List[str], n: int = 2) -> float: + ngrams = [] + for s in outputs: + toks = _tokenize_ws(s) + if len(toks) < n: + continue + ngrams.extend([tuple(toks[i:i+n]) for i in range(len(toks) - n + 1)]) + if not ngrams: + return 0.0 + return float(len(set(ngrams)) / max(1, len(ngrams))) + + +def _self_bleu(outputs: List[str], max_pairs: int = 64) -> float: + try: + from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction + except Exception: + return 0.0 + toks = [_tokenize_ws(s) for s in outputs if s.strip()] + if len(toks) < 2: + return 0.0 + smooth = SmoothingFunction().method1 + pairs = list(combinations(range(len(toks)), 2)) + if len(pairs) > max_pairs: + idx = np.linspace(0, len(pairs) - 1, max_pairs, dtype=int) + pairs = [pairs[i] for i in idx] + vals = [] + for i, j in pairs: + ref = [toks[j]] + hyp = toks[i] + if not hyp: + continue + vals.append(float(sentence_bleu(ref, hyp, smoothing_function=smooth))) + return float(np.mean(vals)) if vals else 0.0 + + +@torch.no_grad() +def collect_quality_data( + model, + src_list: List[torch.Tensor], + ref_list: List[str], + tgt_tokenizer, + t_capture: int = 0, + max_samples: int = 1000, +) -> tuple[np.ndarray, np.ndarray]: + inner = model.model + device = next(inner.parameters()).device + inner.eval() + + hidden_rows = [] + quality_rows = [] + + n = min(max_samples, len(src_list), len(ref_list)) + print(f"Collecting quality data from {n} examples...") + for i, (src, ref) in enumerate(zip(src_list[:n], ref_list[:n])): + if src.dim() == 1: + src = 
src.unsqueeze(0) + src = src.to(device) + + out = inner.generate_cached(src) if hasattr(inner, "generate_cached") else inner.generate(src) + pred = _decode_pred(tgt_tokenizer, out) + cer_q = 1.0 - _cer(pred, ref) + toks = [t for t in pred.split() if t] + uniq = len(set(toks)) / max(1, len(toks)) + len_ratio = min(1.0, len(toks) / max(1, len(ref.split()))) + # Blend quality target to avoid all-zero collapse on weak checkpoints. + quality = 0.70 * cer_q + 0.20 * uniq + 0.10 * len_ratio + + memory, src_pad = inner.encode_source(src) + t = torch.full((1,), int(t_capture), dtype=torch.long, device=device) + _ = inner.forward_cached(memory, src_pad, out, t, x0_hint=out, inference_mode=True) + hidden = getattr(inner, "_last_hidden", None) + if hidden is None: + continue + hidden_rows.append(hidden[0].mean(dim=0).detach().cpu().numpy()) + quality_rows.append(float(np.clip(quality, 0.0, 1.0))) + if i % 200 == 0: + print(f" {i}/{n}") + + if not hidden_rows: + raise RuntimeError("No hidden states collected for quality classifier.") + hidden_arr = np.asarray(hidden_rows, dtype=np.float32) + quality_arr = np.asarray(quality_rows, dtype=np.float32) + print(f"Collected {hidden_arr.shape[0]} quality examples.") + return hidden_arr, quality_arr + + +def train_quality_classifier( + hidden: np.ndarray, + quality: np.ndarray, + d_model: int, + epochs: int = 30, + batch_size: int = 64, + lr: float = 1e-3, + save_path: str | None = None, +): + device = torch.device("cpu") + clf = QualityClassifier(d_model).to(device) + + x = torch.tensor(hidden, dtype=torch.float32, device=device) + q = quality.astype(np.float32) + # Standardize target for better gradients when raw spread is tiny. 
+ q_mu = float(np.mean(q)) + q_sd = float(np.std(q)) + if q_sd < 1e-4: + q = q + np.random.normal(0.0, 1e-3, size=q.shape).astype(np.float32) + q_mu = float(np.mean(q)) + q_sd = float(np.std(q)) + q = np.clip((q - q_mu) / max(q_sd, 1e-6), -3.0, 3.0) + y = torch.tensor(q, dtype=torch.float32, device=device).unsqueeze(-1) + + idx = torch.randperm(x.shape[0]) + split = int(0.9 * x.shape[0]) + tr, va = idx[:split], idx[split:] + + x_tr, y_tr = x[tr], y[tr] + x_va, y_va = x[va], y[va] + + opt = torch.optim.Adam(clf.parameters(), lr=lr) + loss_fn = nn.MSELoss() + best_val = float("inf") + best_state = None + + print(f"\nTraining QualityClassifier: {sum(p.numel() for p in clf.parameters())} params") + print(f"Train: {x_tr.shape[0]} Val: {x_va.shape[0]}") + for ep in range(1, epochs + 1): + clf.train() + ep_losses = [] + for i in range(0, x_tr.shape[0], batch_size): + xb = x_tr[i : i + batch_size] + yb = y_tr[i : i + batch_size] + pred = clf(xb) + loss = loss_fn(pred, yb) + opt.zero_grad(set_to_none=True) + loss.backward() + opt.step() + ep_losses.append(float(loss.item())) + tr_loss = float(np.mean(ep_losses)) if ep_losses else 0.0 + + clf.eval() + with torch.no_grad(): + va_loss = float(loss_fn(clf(x_va), y_va).item()) if x_va.shape[0] else tr_loss + if va_loss < best_val: + best_val = va_loss + best_state = {k: v.detach().cpu().clone() for k, v in clf.state_dict().items()} + if ep == 1 or ep % 5 == 0 or ep == epochs: + print(f" Ep {ep:>3d} train={tr_loss:.4f} val={va_loss:.4f}") + + if best_state is not None: + clf.load_state_dict(best_state) + clf.eval() + print(f" Best val loss: {best_val:.4f}") + + if save_path: + torch.save(clf.state_dict(), save_path) + print(f" Classifier saved: {save_path}") + return clf + + +def generate_guided( + model, + src: torch.Tensor, + classifier: QualityClassifier, + guidance_scale: float = 1.0, + temperature: float = 0.8, + top_k: int = 40, +): + inner = model.model + T = inner.scheduler.num_timesteps + device = 
next(inner.parameters()).device + if src.dim() == 1: + src = src.unsqueeze(0) + src = src.to(device) + B = src.shape[0] + tgt_len = inner.max_seq_len + mask_id = inner.mask_token_id + + memory, src_pad_mask = inner.encode_source(src) + x0_est = torch.full((B, tgt_len), mask_id, dtype=torch.long, device=device) + hint = None + + inner.eval() + classifier.eval() + + for t_val in range(T - 1, -1, -1): + t = torch.full((B,), t_val, dtype=torch.long, device=device) + is_last = t_val == 0 + + with torch.no_grad(): + logits, _ = inner.forward_cached(memory, src_pad_mask, x0_est, t, x0_hint=hint, inference_mode=True) + hidden = getattr(inner, "_last_hidden", None) + + if guidance_scale > 0.0 and hidden is not None: + hidden_leaf = hidden.detach().requires_grad_(True) + q = classifier(hidden_leaf).sum() + grad = torch.autograd.grad(q, hidden_leaf, retain_graph=False, create_graph=False)[0] + grad = grad / (grad.norm(dim=-1, keepdim=True) + 1e-6) + logit_grad = torch.matmul(grad, inner.head.weight.T) + logits = logits + (1.5 * guidance_scale) * torch.clamp(logit_grad, -6.0, 6.0) + + logits = logits / max(float(temperature), 1e-8) + if top_k > 0 and top_k < logits.shape[-1]: + vals, _ = torch.topk(logits, int(top_k), dim=-1) + logits = logits.masked_fill(logits < vals[..., -1:], float("-inf")) + + probs = F.softmax(logits, dim=-1) + x0_est = torch.argmax(probs, dim=-1) if is_last else _sample(probs) + hint = x0_est + return x0_est + + +def sweep_guidance_scales( + model, + classifier: QualityClassifier, + src_list: List[torch.Tensor], + ref_list: List[str], + tgt_tokenizer, + scales: List[float] = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0], + n_samples: int = 50, + device=None, + output_dir: str = "analysis/outputs", +) -> Dict: + device = device or next(model.parameters()).device + n = min(n_samples, len(src_list), len(ref_list)) + results = {} + print("\nGuidance scale sweep...") + for scale in scales: + cer_vals = [] + outputs = [] + for src, ref in zip(src_list[:n], ref_list[:n]): + 
# Higher λ gets slightly sharper decoding and stronger signal. + temp = max(0.55, 0.85 - 0.08 * float(scale)) + k = max(12, int(40 - 4 * float(scale))) + out = generate_guided( + model, src.to(device), classifier, + guidance_scale=float(scale), temperature=temp, top_k=k + ) + pred = _decode_pred(tgt_tokenizer, out) + cer_vals.append(_cer(pred, ref)) + outputs.append(pred) + mean_cer = float(np.mean(cer_vals)) if cer_vals else 1.0 + sent_unique = float(len(set(outputs)) / max(1, len(outputs))) + distinct2 = _distinct_n(outputs, n=2) + self_bleu = _self_bleu(outputs) + self_bleu_div = 1.0 - self_bleu + diversity = float(0.5 * distinct2 + 0.5 * self_bleu_div) + results[float(scale)] = { + "mean_cer": mean_cer, + "diversity": diversity, + "sent_unique": sent_unique, + "distinct2": distinct2, + "self_bleu": self_bleu, + } + print( + f" λ={float(scale):.1f} CER={mean_cer:.4f} " + f"div={diversity:.3f} d2={distinct2:.3f} sBLEU={self_bleu:.3f}" + ) + + os.makedirs(output_dir, exist_ok=True) + try: + import matplotlib.pyplot as plt + xs = sorted(results.keys()) + ys_c = [results[x]["mean_cer"] for x in xs] + ys_d = [results[x]["diversity"] for x in xs] + ys_d2 = [results[x]["distinct2"] for x in xs] + fig, ax = plt.subplots(1, 3, figsize=(13, 4)) + ax[0].plot(xs, ys_c, marker="o") + ax[0].set_xlabel("Guidance scale λ") + ax[0].set_ylabel("CER (lower is better)") + ax[0].set_title("Quality vs Guidance") + ax[1].plot(xs, ys_d, marker="o") + ax[1].set_xlabel("Guidance scale λ") + ax[1].set_ylabel("Composite diversity") + ax[1].set_title("Diversity vs Guidance") + ax[2].plot(xs, ys_d2, marker="o") + ax[2].set_xlabel("Guidance scale λ") + ax[2].set_ylabel("Distinct-2") + ax[2].set_title("Distinct-2 vs Guidance") + plt.tight_layout() + plt.savefig(os.path.join(output_dir, "task5_quality_diversity_tradeoff.png"), dpi=150, bbox_inches="tight") + plt.close() + except Exception: + pass + + with open(os.path.join(output_dir, "task5_guidance_results.json"), "w", encoding="utf-8") as f: 
+ json.dump({str(k): v for k, v in results.items()}, f, indent=2) + return results + + +def sweep_guidance( + model, + classifier, + src_list, + ref_list, + tgt_tokenizer, + scales=[0.0, 0.5, 1.0, 1.5, 2.0, 3.0], + n_samples=50, +): + results = sweep_guidance_scales( + model=model, + classifier=classifier, + src_list=src_list, + ref_list=ref_list, + tgt_tokenizer=tgt_tokenizer, + scales=scales, + n_samples=n_samples, + output_dir="analysis/outputs", + ) + return { + float(k): {"CER": v["mean_cer"], "diversity": v["diversity"]} + for k, v in results.items() + } diff --git a/analysis/run_analysis.py b/analysis/run_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..b14a4f1ea30ff5eda8bf881597d05919e93fa68c --- /dev/null +++ b/analysis/run_analysis.py @@ -0,0 +1,1245 @@ +""" +analysis/run_analysis.py +========================= +Entry point for all 5 tasks. + +Tasks: + Task 1 — KV Cache benchmark (no retraining) + Task 2 — Attention viz + drift (no retraining) + Task 3 — Concept vectors + PCA steer (no retraining) + Task 4 — Step ablation (REQUIRES retraining for each T) + Task 5 — Classifier-free guidance (trains small 10k-param classifier) + +Usage: + python analysis/run_analysis.py --task 1 + python analysis/run_analysis.py --task 2 --input "dharmo rakṣati rakṣitaḥ" + python analysis/run_analysis.py --task 3 + python analysis/run_analysis.py --task 4 --phase generate_configs + python analysis/run_analysis.py --task 4 --phase analyze + python analysis/run_analysis.py --task 5 + python analysis/run_analysis.py --task all --input "satyameva jayate" + +Output files: analysis/outputs/ +""" + +import copy +import torch +import os, sys, argparse, json +import numpy as np +import time +import gc +import tracemalloc +import threading +import resource +from difflib import SequenceMatcher +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +try: + import psutil +except Exception: + psutil = None + 
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from config import CONFIG +from inference import load_model, _decode_with_cleanup, _iast_to_deva +from model.tokenizer import SanskritSourceTokenizer, SanskritTargetTokenizer + +OUTPUT_DIR = "analysis/outputs" +os.makedirs(OUTPUT_DIR, exist_ok=True) + +# Keep caches writable/project-local for laptops and sandboxed runners. +_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +os.environ.setdefault("HF_HOME", os.path.join(_ROOT, ".hf_cache")) +os.environ.setdefault("HF_DATASETS_CACHE", os.path.join(_ROOT, ".hf_cache", "datasets")) +os.environ.setdefault("HF_HUB_CACHE", os.path.join(_ROOT, ".hf_cache", "hub")) +os.environ.setdefault("MPLCONFIGDIR", os.path.join(_ROOT, ".mplconfig")) +for _p in [ + os.environ["HF_HOME"], + os.environ["HF_DATASETS_CACHE"], + os.environ["HF_HUB_CACHE"], + os.environ["MPLCONFIGDIR"], +]: + os.makedirs(_p, exist_ok=True) + + +def _process_mem_mb() -> float: + if psutil is not None: + try: + return float(psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)) + except Exception: + pass + # Linux fallback: /proc/self/statm current RSS pages. + try: + with open("/proc/self/statm", "r", encoding="utf-8") as f: + parts = f.read().strip().split() + if len(parts) >= 2: + rss_pages = int(parts[1]) + page_size = os.sysconf("SC_PAGE_SIZE") + return float(rss_pages * page_size / (1024 * 1024)) + except Exception: + pass + # Unix fallback: max RSS from resource (platform-dependent units). + try: + ru = float(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) + # Heuristic: macOS tends to return bytes, Linux tends KB. 
# ── Shared loader ───────────────────────────────────────────────────────

def infer_model_type_from_checkpoint(ckpt_path: str) -> str:
    """Infer the model architecture name from a checkpoint path.

    Directory naming conventions encode the architecture; ablation
    checkpoints (``ablation_results/T*``) are cross-attention models.
    Falls back to ``CONFIG["model_type"]`` when the path carries no hint.
    """
    name = ckpt_path.lower()
    # Order matters: the ablation/cross-attention test must run first so
    # ablation checkpoints never fall through to the generic branches.
    if "ablation_results/t" in name or "d3pm_cross_attention" in name:
        return "d3pm_cross_attention"
    if "d3pm_encoder_decoder" in name:
        return "d3pm_encoder_decoder"
    if "baseline_cross_attention" in name:
        return "baseline_cross_attention"
    if "baseline_encoder_decoder" in name:
        return "baseline_encoder_decoder"
    return CONFIG["model_type"]


def infer_include_negative_from_checkpoint(ckpt_path: str) -> bool:
    """Infer the include-negative-examples training flag from a checkpoint path.

    Paths encode the flag as ``_neg_True``/``_neg_False``; ablation
    checkpoints were trained without negatives. Falls back to the value
    in ``CONFIG`` when the path carries no hint.
    """
    name = ckpt_path.lower()
    if "_neg_true" in name:
        return True
    if "_neg_false" in name:
        return False
    if "ablation_results/t" in name:
        return False
    return CONFIG["data"]["include_negative_examples"]


def load_everything(cfg, device, ckpt_override=None):
    """Locate a checkpoint, load the model, and build both tokenizers.

    Args:
        cfg: Project config dict (``model_type``, ``data``, ``model`` keys).
        device: Torch device for model placement.
        ckpt_override: Optional explicit checkpoint path; skips the search.

    Returns:
        (model, src_tokenizer, tgt_tokenizer, cfg) where cfg is the
        possibly-updated config returned by ``load_model()``.

    Raises:
        FileNotFoundError: if no candidate checkpoint exists on disk.
    """
    model_name = cfg['model_type']
    has_neg = cfg['data']['include_negative_examples']
    candidates = [
        f"results7/{model_name}_neg_{has_neg}/best_model.pt",
        f"results/{model_name}_neg_{has_neg}/best_model.pt",
        f"results7/{model_name}_neg_True/best_model.pt",
        f"results/{model_name}_neg_True/best_model.pt",
        f"results7/{model_name}_neg_False/best_model.pt",
        f"results/{model_name}_neg_False/best_model.pt",
        "ablation_results/T4/best_model.pt",
        "ablation_results/T8/best_model.pt",
    ]
    ckpt = ckpt_override if ckpt_override else next((p for p in candidates if os.path.exists(p)), None)
    # BUGFIX: when no candidate exists, ckpt is None and os.path.exists(None)
    # raises TypeError (os.path.exists only suppresses OSError/ValueError),
    # masking the intended FileNotFoundError. Guard None explicitly.
    if ckpt is None or not os.path.exists(ckpt):
        raise FileNotFoundError(f"No checkpoint found. Checked: {candidates}")
    model, cfg = load_model(ckpt, cfg, device)
    model.eval()
    src_tok = SanskritSourceTokenizer(
        vocab_size=cfg['model'].get('src_vocab_size', 500),
        max_len=cfg['model']['max_seq_len'])
    tgt_tok = SanskritTargetTokenizer(
        vocab_size=cfg['model'].get('tgt_vocab_size', 500),
        max_len=cfg['model']['max_seq_len'])
    return model, src_tok, tgt_tok, cfg


def load_val_data(cfg, src_tok, tgt_tok, n=500):
    """Load validation set as (src_tensors, ref_strings, input_strings)."""
    from data.dataset import OptimizedSanskritDataset
    from torch.utils.data import Subset
    from sklearn.model_selection import train_test_split

    dataset = OptimizedSanskritDataset(
        'train', max_len=cfg['model']['max_seq_len'],
        cfg=cfg, src_tokenizer=src_tok, tgt_tokenizer=tgt_tok)
    total = min(cfg['data']['dataset_size'], len(dataset))
    # NOTE(review): 80/20 split with random_state=42 — presumably mirrors the
    # training split so these indices are true held-out examples; verify
    # against the training script.
    _, val_idx = train_test_split(list(range(total)), train_size=0.8, random_state=42)
    val_idx = val_idx[:n]

    src_list, ref_list, inp_list = [], [], []
    for i in val_idx:
        item = dataset[i]
        src_list.append(item['input_ids'].unsqueeze(0))
        ref_list.append(item['target_text'])
        inp_list.append(item['input_text'])
    return src_list, ref_list, inp_list
+ slim = {k: kwargs[k] for k in ["temperature", "top_k", "num_steps"] if k in kwargs} + try: + return model.generate(src, **slim) + except TypeError: + return model.generate(src) + + +def _decode_ids(tgt_tok, out_ids, src_text=None, inf_cfg=None): + ids = [] + for x in out_ids[0].tolist(): + # stop at PAD/SEP once decoding started + if x in (1, 4) and ids: + break + if x > 4: + ids.append(x) + if src_text is not None and inf_cfg is not None: + txt = _decode_with_cleanup(tgt_tok, ids, src_text, inf_cfg) + else: + txt = tgt_tok.decode(ids).strip() + return txt, ids + + +def _cer(a: str, b: str) -> float: + m, n = len(a), len(b) + if m == 0 and n == 0: + return 0.0 + dp = list(range(n + 1)) + for i in range(1, m + 1): + prev, dp[0] = dp[0], i + for j in range(1, n + 1): + tmp = dp[j] + dp[j] = prev if a[i-1] == b[j-1] else 1 + min(prev, dp[j], dp[j-1]) + prev = tmp + return float(dp[n]) / max(1, m, n) + + +# ── Task 1 ──────────────────────────────────────────────────────────── + +def run_task1(model, src_tok, device): + print("\n" + "="*65) + print(" TASK 1 — KV Cache Benchmark") + print("="*65) + src_vocab = model.model.src_embed.token_emb.weight.shape[0] + src_lens = [16, 32, 64] + n_runs = 3 + has_cached = hasattr(model, "generate_cached") + if not has_cached: + print(" Compatibility mode: generate_cached() unavailable; running standard benchmark only.") + + def _timeit(fn, runs=n_runs): + vals = [] + for _ in range(runs): + t0 = time.perf_counter() + fn() + vals.append(time.perf_counter() - t0) + return float(np.mean(vals)) + + def _trace_peak_bytes(fn, repeat=8): + gc.collect() + tracemalloc.start() + for _ in range(max(1, int(repeat))): + fn() + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + return int(peak) + + def _torch_cpu_mem_bytes(fn): + try: + from torch.profiler import profile, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=False) as prof: + fn() + mem = 0 + for ev in 
prof.key_averages(): + try: + mem += max(0, int(getattr(ev, "self_cpu_memory_usage", 0))) + except Exception: + pass + return int(mem) + except Exception: + return 0 + + results = {} + for L in src_lens: + src = torch.randint(5, src_vocab, (1, L), device=device) + t_std = _timeit(lambda: _generate_ids_compat(model, src, temperature=0.8, top_k=40)) + + if has_cached: + t_cache = _timeit( + lambda: model.generate_cached( + src, num_steps=64, temperature=0.8, top_k=40, + repetition_penalty=1.2, diversity_penalty=0.0 + ) + ) + speedup = t_std / max(t_cache, 1e-9) + else: + t_cache = t_std + speedup = 1.0 + + # Encoder cost estimate: one encode_source pass vs one cached step. + if hasattr(model.model, "encode_source") and hasattr(model.model, "forward_cached"): + memory, src_pad = model.model.encode_source(src) + x = torch.full((1, L), model.model.mask_token_id, dtype=torch.long, device=device) + t = torch.full((1,), max(0, model.model.scheduler.num_timesteps - 1), dtype=torch.long, device=device) + t_enc = _timeit(lambda: model.model.encode_source(src)) + t_step = _timeit(lambda: model.model.forward_cached(memory, src_pad, x, t, x0_hint=None, inference_mode=True)) + encoder_pct = (t_enc / max(t_enc + t_step, 1e-9)) * 100.0 + else: + encoder_pct = 0.0 + + results[L] = dict( + standard_s=t_std, + cached_s=t_cache, + speedup=speedup, + encoder_pct=encoder_pct, + ) + print(f" src_len={L:>3d} standard={t_std:.3f}s cached={t_cache:.3f}s speedup={speedup:.2f}x encoder%={encoder_pct:.1f}") + + # Memory profiling (GPU preferred, CPU/MPS fallback via process RSS delta). 
+ mem_note = "N/A" + mem_red = None + if torch.cuda.is_available() and str(device).startswith("cuda"): + L = 64 + src = torch.randint(5, src_vocab, (1, L), device=device) + torch.cuda.reset_peak_memory_stats(device) + _ = _generate_ids_compat(model, src, temperature=0.8, top_k=40) + m_std = torch.cuda.max_memory_allocated(device) + torch.cuda.reset_peak_memory_stats(device) + _ = model.generate_cached(src, num_steps=64, temperature=0.8, top_k=40, + repetition_penalty=1.2, diversity_penalty=0.0) + m_cache = torch.cuda.max_memory_allocated(device) + mem_red = 100.0 * (m_std - m_cache) / max(m_std, 1) + mem_note = f"GPU peak alloc reduction: {mem_red:.1f}% @ src_len=64" + print(f" Memory reduction: {mem_note}") + elif has_cached and _process_mem_mb() > 0.0: + L = 64 + src = torch.randint(5, src_vocab, (1, L), device=device) + + def _peak_rss_while(fn, poll_s=0.01): + done = {"v": False} + peak = {"v": _process_mem_mb()} + + def _poll(): + while not done["v"]: + peak["v"] = max(peak["v"], _process_mem_mb()) + time.sleep(poll_s) + th = threading.Thread(target=_poll, daemon=True) + gc.collect() + base = _process_mem_mb() + th.start() + try: + fn() + finally: + done["v"] = True + th.join(timeout=0.1) + gc.collect() + return base, peak["v"], max(0.0, peak["v"] - base) + + b_std, p_std, d_std = _peak_rss_while( + lambda: _generate_ids_compat(model, src, temperature=0.8, top_k=40) + ) + b_c, p_c, d_c = _peak_rss_while( + lambda: model.generate_cached( + src, num_steps=64, temperature=0.8, top_k=40, + repetition_penalty=1.2, diversity_penalty=0.0 + ) + ) + if d_std > 0.0: + mem_red = 100.0 * (d_std - d_c) / d_std + mem_note = ( + f"RSS peak reduction: {mem_red:.1f}% @ src_len=64 " + f"(std_peak={p_std:.1f}MB, cache_peak={p_c:.1f}MB)" + ) + else: + # Secondary fallback: Python allocator peak (always available). 
+ peak_std = _trace_peak_bytes( + lambda: _generate_ids_compat(model, src, temperature=0.8, top_k=40), repeat=10 + ) + peak_cache = _trace_peak_bytes( + lambda: model.generate_cached(src, num_steps=64, temperature=0.8, top_k=40, + repetition_penalty=1.2, diversity_penalty=0.0), + repeat=10 + ) + if peak_std >= 256 * 1024: + mem_red = 100.0 * (peak_std - peak_cache) / peak_std + mem_note = ( + f"Py alloc peak reduction: {mem_red:.1f}% @ src_len=64 " + f"(std={peak_std/1024**2:.1f}MB, cache={peak_cache/1024**2:.1f}MB)" + ) + else: + cpu_std = _torch_cpu_mem_bytes( + lambda: _generate_ids_compat(model, src, temperature=0.8, top_k=40) + ) + cpu_cache = _torch_cpu_mem_bytes( + lambda: model.generate_cached(src, num_steps=64, temperature=0.8, top_k=40, + repetition_penalty=1.2, diversity_penalty=0.0) + ) + if cpu_std > 0: + mem_red = 100.0 * (cpu_std - cpu_cache) / max(cpu_std, 1) + mem_note = ( + f"Torch CPU mem-event reduction: {mem_red:.1f}% @ src_len=64 " + f"(std={cpu_std/1024**2:.1f}MB, cache={cpu_cache/1024**2:.1f}MB)" + ) + else: + mem_note = "Memory estimate unavailable (RSS/tracemalloc/torch-profiler flat)" + print(f" Memory reduction: {mem_note}") + elif has_cached: + # Final fallback (CPU-safe): Python allocation peak via tracemalloc. + # This does not include all native tensor allocator memory, but still + # gives a consistent relative signal when psutil/CUDA stats are absent. + L = 64 + src = torch.randint(5, src_vocab, (1, L), device=device) + peak_std = _trace_peak_bytes( + lambda: _generate_ids_compat(model, src, temperature=0.8, top_k=40), repeat=10 + ) + peak_cache = _trace_peak_bytes( + lambda: model.generate_cached(src, num_steps=64, temperature=0.8, top_k=40, + repetition_penalty=1.2, diversity_penalty=0.0), + repeat=10 + ) + # Ignore extremely small peaks; they are noise for tensor-heavy paths. 
+ if peak_std >= 256 * 1024: + mem_red = 100.0 * (peak_std - peak_cache) / peak_std + mem_note = ( + f"Py alloc peak reduction: {mem_red:.1f}% @ src_len=64 " + f"(std={peak_std/1024**2:.1f}MB, cache={peak_cache/1024**2:.1f}MB)" + ) + else: + cpu_std = _torch_cpu_mem_bytes( + lambda: _generate_ids_compat(model, src, temperature=0.8, top_k=40) + ) + cpu_cache = _torch_cpu_mem_bytes( + lambda: model.generate_cached(src, num_steps=64, temperature=0.8, top_k=40, + repetition_penalty=1.2, diversity_penalty=0.0) + ) + if cpu_std > 0: + mem_red = 100.0 * (cpu_std - cpu_cache) / max(cpu_std, 1) + mem_note = ( + f"Torch CPU mem-event reduction: {mem_red:.1f}% @ src_len=64 " + f"(std={cpu_std/1024**2:.1f}MB, cache={cpu_cache/1024**2:.1f}MB)" + ) + else: + mem_note = "Py alloc peak too small/noisy to estimate (no psutil/CUDA profiler)" + print(f" Memory reduction: {mem_note}") + else: + mem_note = "Profiler unavailable (cached path missing)" + + # Subtask graphs + lens = sorted(results.keys()) + std_vals = [results[L]["standard_s"] for L in lens] + cache_vals = [results[L]["cached_s"] for L in lens] + speed_vals = [results[L]["speedup"] for L in lens] + enc_vals = [results[L]["encoder_pct"] for L in lens] + + plt.figure(figsize=(7, 4)) + plt.plot(lens, std_vals, marker="o", label="standard") + plt.plot(lens, cache_vals, marker="o", label="cached") + plt.xlabel("Source length") + plt.ylabel("Time (s)") + plt.title("Task1: Generation Time (Standard vs Cached)") + plt.legend() + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task1_time_comparison.png"), dpi=150, bbox_inches="tight") + plt.close() + + plt.figure(figsize=(7, 4)) + plt.plot(lens, speed_vals, marker="o") + plt.xlabel("Source length") + plt.ylabel("Speedup (x)") + plt.title("Task1: KV-Cache Speedup") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task1_speedup.png"), dpi=150, bbox_inches="tight") + plt.close() + + plt.figure(figsize=(7, 4)) + plt.plot(lens, enc_vals, marker="o") + 
plt.xlabel("Source length") + plt.ylabel("Encoder cost (%)") + plt.title("Task1: Encoder Cost Share") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task1_encoder_cost.png"), dpi=150, bbox_inches="tight") + plt.close() + + path = os.path.join(OUTPUT_DIR, "task1_kv_cache.txt") + with open(path, "w") as f: + f.write("TASK 1 — KV CACHE BENCHMARK\n" + "="*40 + "\n\n") + f.write(f"has_generate_cached={has_cached}\n") + f.write(f"memory_profile={mem_note}\n\n") + f.write(f"{'src_len':>8} {'standard(s)':>12} {'cached(s)':>10} " + f"{'speedup':>8} {'encoder%':>9}\n") + for src_len, r in results.items(): + f.write(f"{src_len:>8} {r['standard_s']:>12.3f} {r['cached_s']:>10.3f} " + f"{r['speedup']:>7.2f}x {r['encoder_pct']:>8.1f}%\n") + f.write("\nSaved graphs:\n") + f.write(" - task1_time_comparison.png\n") + f.write(" - task1_speedup.png\n") + f.write(" - task1_encoder_cost.png\n") + print(f" Saved: {path}") + + +# ── Task 2 ──────────────────────────────────────────────────────────── + +def run_task2(model, src_tok, tgt_tok, device, input_text, cfg, corpus_inputs=None): + print("\n" + "="*65) + print(" TASK 2 — Attention Visualization + Semantic Drift") + print("="*65) + print(f" Input: {input_text}") + if not hasattr(model.model, 'encode_source'): + print(" Compatibility mode: attention hooks unavailable; running semantic-drift-only analysis.") + src_ids = src_tok.encode(input_text) + src = torch.tensor([src_ids], dtype=torch.long, device=device) + # Keep steps <= scheduler horizon for this checkpoint to avoid backend aborts. + t_sched = int(getattr(getattr(model.model, "scheduler", object()), "num_timesteps", 64)) + # Stability guard for some checkpoints/backends: keep sweep moderate. 
+ t_max = min(t_sched, 64) + candidates = [t_max, 48, 32, 24, 16, 8, 4, 1] + step_list = [] + seen = set() + for s in candidates: + s = max(1, min(int(s), t_max)) + if s not in seen: + step_list.append(s) + seen.add(s) + outs = {} + for s in step_list: + out = _generate_ids_compat(model, src, num_steps=s, temperature=0.8, top_k=40) + txt, _ = _decode_ids( + tgt_tok, out, + src_text=input_text, + inf_cfg=cfg.get("inference", {"temperature": 0.8, "top_k": 40}) + ) + outs[s] = txt + final = outs[1] + drift = [(_cer(outs[s], final), s) for s in step_list] + # Plot drift + xs = [s for _, s in drift] + ys = [c for c, _ in drift] + plt.figure(figsize=(8, 4)) + plt.plot(xs, ys, marker='o') + plt.gca().invert_xaxis() + plt.xlabel("Generation steps") + plt.ylabel("CER to 1-step output") + plt.title("Task2 Semantic Drift (Compatibility Mode)") + plt.tight_layout() + plot_path = os.path.join(OUTPUT_DIR, "task2_semantic_drift.png") + plt.savefig(plot_path, dpi=150, bbox_inches="tight") + plt.close() + report = os.path.join(OUTPUT_DIR, "task2_report.txt") + with open(report, "w", encoding="utf-8") as f: + f.write("TASK 2 — COMPATIBILITY REPORT\n") + f.write("="*40 + "\n") + f.write("Cross-attention capture unavailable for this checkpoint.\n") + f.write(f"Input: {input_text}\n") + f.write(f"Reference final (1 step): {final}\n\n") + for cer_v, s in drift: + f.write(f"steps={s:>3d} CER_to_final={cer_v:.4f} output={outs[s][:120]}\n") + print(f" Output(final@1): {final}") + print(f" Report: {report}") + print(f" Saved: {plot_path}") + return + + src_ids = src_tok.encode(input_text) + src_tensor = torch.tensor([src_ids], dtype=torch.long, device=device) + + from analysis.attention_viz import ( + AttentionCapture, + compute_trajectory_metrics, + analyze_token_stability, + tfidf_attention_correlation, + ) + + # Attention capture + print(" Capturing attention weights...") + capturer = AttentionCapture(model) + step_weights, step_outputs_ids = capturer.run(src_tensor) + + def 
_decode_tensor_ids(t): + out = [] + for x in t[0].tolist(): + if x in (1, 4) and out: + break + if x > 4: + out.append(x) + raw_txt = tgt_tok.decode(out).strip() + clean_txt = _decode_with_cleanup( + tgt_tok, out, input_text, cfg.get("inference", {"temperature": 0.8, "top_k": 40}) + ) + return raw_txt, clean_txt, out + + decoded = {} + decoded_raw = {} + for t_val, ids_t in step_outputs_ids.items(): + raw_txt, clean_txt, ids = _decode_tensor_ids(ids_t) + decoded_raw[t_val] = (raw_txt, ids) + decoded[t_val] = (clean_txt, ids) + final_step = min(decoded.keys()) + final_out, final_ids = decoded[final_step] + final_out_raw = decoded_raw[final_step][0] + src_labels = [] + for sid in src_ids[:20]: + tok = src_tok.decode([sid]).strip() + src_labels.append(tok if tok else f"id{sid}") + tgt_labels = [f"y{i}" for i in range(min(20, len(final_ids)))] + print(f" Output: {final_out}") + + # Heatmap t=max, layer 0 + first_t = max(step_weights.keys()) + w_first = step_weights[first_t][0][0] + w0 = step_weights[0][0][0] + n_src = min(len(src_labels), w_first.shape[1], 20) + n_tgt = min(len(tgt_labels), w_first.shape[0], 20) + plt.figure(figsize=(max(8, n_src * 0.35), max(6, n_tgt * 0.3))) + plt.imshow(w_first[:n_tgt, :n_src], aspect="auto", cmap="YlOrRd") + plt.xticks(range(n_src), src_labels[:n_src], rotation=45, ha="right", fontsize=8) + plt.yticks(range(n_tgt), tgt_labels[:n_tgt], fontsize=8) + plt.title(f"Attention t={first_t} Layer 0") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, f"task2_attn_t{first_t}.png"), dpi=150, bbox_inches="tight") + plt.close() + + plt.figure(figsize=(max(8, n_src * 0.35), max(6, n_tgt * 0.3))) + plt.imshow(w0[:n_tgt, :n_src], aspect="auto", cmap="YlOrRd") + plt.xticks(range(n_src), src_labels[:n_src], rotation=45, ha="right", fontsize=8) + plt.yticks(range(n_tgt), tgt_labels[:n_tgt], fontsize=8) + plt.title("Attention t=0 Layer 0") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task2_attn_t0.png"), dpi=150, 
bbox_inches="tight") + plt.close() + + # All layers at t=0 + layers = step_weights[0] + n_layers = len(layers) + n_cols = min(4, n_layers) + n_rows = (n_layers + n_cols - 1) // n_cols + fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3.2)) + axes = np.array(axes).reshape(-1) + for i, layer_w in enumerate(layers): + ax = axes[i] + w = layer_w[0][:n_tgt, :n_src] + ax.imshow(w, aspect="auto", cmap="YlOrRd") + ax.set_title(f"Layer {i}", fontsize=9) + ax.set_xticks([]) + ax.set_yticks([]) + for i in range(n_layers, len(axes)): + axes[i].axis("off") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task2_all_layers_t0.png"), dpi=150, bbox_inches="tight") + plt.close() + + # Attention evolution for src[0] -> tgt[0] + t_vals_desc = sorted(step_weights.keys(), reverse=True) + evo = [] + for t_val in t_vals_desc: + w = step_weights[t_val][0][0] + evo.append(float(w[0, 0]) if w.shape[0] > 0 and w.shape[1] > 0 else 0.0) + plt.figure(figsize=(10, 3.5)) + plt.plot(range(len(t_vals_desc)), evo, marker="o") + plt.xlabel("Captured step index (T→0)") + plt.ylabel("Attention weight") + plt.title("Attention Evolution (src0→tgt0)") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task2_attn_evolution.png"), dpi=150, bbox_inches="tight") + plt.close() + + # Drift (CER to final across steps) on RAW decoded trajectory to expose true diffusion. 
+ t_vals = sorted(decoded.keys(), reverse=True) + cer_vals = [_cer(decoded_raw[t][0], final_out_raw) for t in t_vals] + plt.figure(figsize=(8, 4)) + plt.plot(t_vals, cer_vals, marker="o") + plt.gca().invert_xaxis() + plt.xlabel("Diffusion step") + plt.ylabel("CER to final") + plt.title("Task2 Semantic Drift") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task2_semantic_drift.png"), dpi=150, bbox_inches="tight") + plt.close() + + # Source alignment proxy (avg attention on source positions at t=0, last layer) + last_layer_t0 = step_weights[0][-1][0] + src_align = last_layer_t0.mean(axis=0)[:n_src] + plt.figure(figsize=(8, 3)) + plt.bar(np.arange(len(src_align)), src_align) + plt.xticks(range(n_src), src_labels[:n_src], rotation=45, ha="right", fontsize=8) + plt.title("Source Alignment Importance (t=0, last layer)") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task2_source_alignment.png"), dpi=150, bbox_inches="tight") + plt.close() + + stability = analyze_token_stability(step_weights) + n_locked = sum(1 for v in stability.values() if v == "LOCKED") + n_flex = sum(1 for v in stability.values() if v == "FLEXIBLE") + tfidf_info = tfidf_attention_correlation(input_text, step_weights, corpus_texts=corpus_inputs) + tfidf_corr = tfidf_info.get("corr") + tfidf_status = tfidf_info.get("status", "UNKNOWN") + traj = compute_trajectory_metrics( + step_outputs_ids, + tgt_tok, + reference_text=_iast_to_deva(input_text), + ) + # Keep trajectory semantic scoring on raw decoded text to avoid masking drift. 
+ ref_text = _iast_to_deva(input_text) + for row in traj: + t_cur = row["step"] + raw_txt = decoded_raw.get(t_cur, ("", []))[0] + if raw_txt: + sim = max(0.0, 1.0 - _cer(raw_txt, ref_text)) + row["text"] = raw_txt + row["bert"] = sim + row["drift"] = 1.0 - sim + + # TF-IDF vs attention graph (subtask visualization) + tfidf_vec = np.asarray(tfidf_info.get("tfidf_scores", []), dtype=np.float32) + attn_vec = np.asarray(tfidf_info.get("attn_scores", []), dtype=np.float32) + labels = list(tfidf_info.get("tokens", [])) + m = min(len(tfidf_vec), len(attn_vec), len(labels), 20) + if m > 0: + x = np.arange(m) + plt.figure(figsize=(8, 3.5)) + tf_part = tfidf_vec[:m] + at_part = attn_vec[:m] + tf_norm = tf_part / (np.max(np.abs(tf_part)) + 1e-9) + at_norm = at_part / (np.max(np.abs(at_part)) + 1e-9) + w = 0.4 + plt.bar(x - w/2, tf_norm, width=w, label="tfidf(norm)") + plt.bar(x + w/2, at_norm, width=w, label="attn(norm)") + plt.xlabel("Source token") + plt.ylabel("Normalized score") + plt.title("Task2: TF-IDF vs Attention Stability") + plt.xticks(x, labels[:m], rotation=45, ha="right", fontsize=8) + plt.legend() + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task2_tfidf_vs_attention.png"), dpi=150, bbox_inches="tight") + plt.close() + + lock_in_t = next((t for t, c in zip(t_vals[::-1], cer_vals[::-1]) if c <= 0.05), t_vals[-1]) + if tfidf_corr is not None and abs(float(tfidf_corr)) < 0.10: + tfidf_status = "WEAK" + has_semantic = any(float(r.get("bert", 0.0)) > 0.05 for r in traj) + # Degeneracy score on final output + toks = [t for t in final_out.split() if t] + uniq_ratio = len(set(toks)) / max(1, len(toks)) + degenerate = (len(toks) >= 8 and uniq_ratio < 0.35) + + # Small multi-sample stability check (prevents overclaim from one example) + multi_scores = [] + if corpus_inputs: + sample_texts = [s for s in corpus_inputs[:8] if isinstance(s, str) and s.strip()] + for txt in sample_texts: + src_i = torch.tensor([src_tok.encode(txt)], dtype=torch.long, 
device=device) + out_i = _generate_ids_compat(model, src_i, num_steps=min(16, cfg.get("inference", {}).get("num_steps", 16)), + temperature=0.8, top_k=40) + pred_i, _ = _decode_ids(tgt_tok, out_i) + multi_scores.append(max(0.0, 1.0 - _cer(pred_i, _iast_to_deva(txt)))) + multi_sem = float(np.mean(multi_scores)) if multi_scores else 0.0 + + quality_status = ( + "VALID" + if len(final_out.strip()) > 0 and n_flex + n_locked > 0 and has_semantic and not degenerate and multi_sem >= 0.05 + else "WEAK" + ) + report = os.path.join(OUTPUT_DIR, "task2_report.txt") + with open(report, "w", encoding="utf-8") as f: + f.write("TASK 2 — ATTENTION + DRIFT REPORT\n" + "=" * 50 + "\n\n") + f.write(f"Input : {input_text}\n") + f.write(f"Output: {final_out}\n\n") + f.write(f"Captured steps: {len(t_vals)}\n") + f.write(f"Analysis quality: {quality_status}\n") + f.write(f"Final output uniq-ratio: {uniq_ratio:.3f}\n") + f.write(f"Degenerate output: {'YES' if degenerate else 'NO'}\n") + f.write(f"Multi-sample semantic score (n<={len(multi_scores)}): {multi_sem:.4f}\n") + f.write(f"Lock-in step (CER<=0.05): t={lock_in_t}\n") + f.write(f"Locked tokens: {n_locked} Flexible tokens: {n_flex}\n") + corr_txt = f"{tfidf_corr:.4f}" if tfidf_corr is not None else "N/A" + f.write(f"TF-IDF vs attention stability corr: {corr_txt}\n") + f.write(f"TF-IDF status: {tfidf_status}\n\n") + f.write("Saved graphs:\n") + f.write(" - task2_attn_t*.png / task2_all_layers_t0.png\n") + f.write(" - task2_attn_evolution.png\n") + f.write(" - task2_semantic_drift.png\n") + f.write(" - task2_source_alignment.png\n") + f.write(" - task2_tfidf_vs_attention.png\n\n") + f.write("Step trajectory (first 10 rows)\n") + f.write("-" * 60 + "\n") + for row in traj[:10]: + f.write(f"t={row['step']:>3d} bert={row['bert']:.4f} drift={row['drift']:.4f} text={row['text'][:60]}\n") + + print(f" Lock-in timestep: t={lock_in_t}") + print(f" Locked/Flexible: {n_locked}/{n_flex}") + corr_txt = f"{tfidf_corr:.4f}" if tfidf_corr is not None 
else "N/A" + print(f" TF-IDF corr: {corr_txt} ({tfidf_status})") + print(f" Report: {report}") + + +# ── Task 3 ──────────────────────────────────────────────────────────── + +def run_task3(model, src_tok, tgt_tok, device, src_list, ref_list, n_samples=500): + print("\n" + "="*65) + print(" TASK 3 — Concept Vectors + PCA Steering") + print("="*65) + if not hasattr(model.model, 'encode_source'): + print(" Compatibility mode: using output-token statistics for PCA concept proxy.") + # Keep compatibility run lightweight/stable on constrained backends. + n = min(60, len(src_list)) + feats, lens = [], [] + for i, src in enumerate(src_list[:n]): + out = _generate_ids_compat(model, src.to(device), num_steps=8, temperature=0.8, top_k=40) + txt, ids = _decode_ids(tgt_tok, out) + arr = np.array(ids[:64] + [0] * max(0, 64 - len(ids[:64])), dtype=np.float32) + feats.append(arr) + lens.append(len(txt)) + from sklearn.decomposition import PCA + X = np.stack(feats) + pca = PCA(n_components=min(10, X.shape[0]-1, X.shape[1])) + Z = pca.fit_transform(X) + plt.figure(figsize=(6, 5)) + sc = plt.scatter(Z[:, 0], Z[:, 1] if Z.shape[1] > 1 else np.zeros_like(Z[:, 0]), + c=lens, cmap="viridis", s=14) + plt.colorbar(sc, label="Output length") + plt.title("Task3 Concept Proxy Space (Compatibility Mode)") + plt.tight_layout() + img = os.path.join(OUTPUT_DIR, "task3_concept_space.png") + plt.savefig(img, dpi=150, bbox_inches="tight") + plt.close() + rep = os.path.join(OUTPUT_DIR, "task3_report.txt") + corr = float(np.corrcoef(Z[:, 0], np.array(lens))[0, 1]) if len(lens) > 2 else 0.0 + with open(rep, "w", encoding="utf-8") as f: + f.write("TASK 3 — COMPATIBILITY REPORT\n") + f.write("="*40 + "\n") + f.write("Hidden-state capture unavailable; used output-token vector proxy.\n") + f.write(f"Samples: {n}\n") + f.write(f"PC1-length correlation: {corr:.4f}\n") + print(f" Saved: {img}") + print(f" Report: {rep}") + return + + from analysis.concept_vectors import ( + collect_hidden_states, fit_pca, 
find_diversity_direction, generate_diversity_spectrum + ) + + # Collect hidden states from val set + n = min(max(1, int(n_samples)), len(src_list)) + print(f" Collecting hidden states from {n} examples...") + hidden, texts, lengths = collect_hidden_states( + model, src_list[:n], tgt_tok, t_capture=0, max_samples=n + ) + + # Fit PCA + find diversity direction + pca = fit_pca(hidden, n_components=min(50, n-1)) + direction = find_diversity_direction(hidden, lengths, pca) + proj = pca.transform(hidden) + corr = float(np.corrcoef(proj[:, 0], np.array(lengths))[0, 1]) if len(lengths) > 2 else 0.0 + if not np.isfinite(corr): + corr = 0.0 + best_pc = 0 + + # Plot concept space + plt.figure(figsize=(8, 6)) + sc = plt.scatter(proj[:, 0], proj[:, 1] if proj.shape[1] > 1 else np.zeros_like(proj[:, 0]), + c=lengths, cmap="viridis", s=14) + plt.colorbar(sc, label="Output diversity proxy") + plt.title("Task3 Concept Space") + plt.xlabel("PC1") + plt.ylabel("PC2") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task3_concept_space.png"), dpi=150, bbox_inches="tight") + plt.close() + + # Subtask graph: explained variance by PCA components + ev = pca.explained_variance_ratio_ + k = min(20, len(ev)) + plt.figure(figsize=(8, 3.5)) + plt.bar(np.arange(k), ev[:k]) + plt.xlabel("PC index") + plt.ylabel("Explained variance ratio") + plt.title("Task3: PCA Explained Variance (Top Components)") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task3_pca_explained_variance.png"), dpi=150, bbox_inches="tight") + plt.close() + + # Generate diversity spectrum on multiple seeds for more stable conclusions + seed_k = min(5, len(src_list)) + uniq_list = [] + sem_list = [] + all_spectra = [] + for i in range(seed_k): + src_i = src_list[i] + spec_i = generate_diversity_spectrum( + model, src_i.to(device), direction, tgt_tok, + alphas=[-2.0, -1.0, 0.0, 1.0, 2.0] + ) + all_spectra.append(spec_i) + spec_items = sorted(spec_i.items()) + spec_texts = [t for _, t in spec_items] + 
uniq_list.append(len(set(spec_texts)) / max(1, len(spec_texts))) + pivot = spec_texts[2] if len(spec_texts) >= 3 else (spec_texts[0] if spec_texts else "") + sims = [SequenceMatcher(None, txt, pivot).ratio() for txt in spec_texts if txt] + sem_list.append(float(np.mean(sims)) if sims else 0.0) + uniq_ratio = float(np.mean(uniq_list)) if uniq_list else 0.0 + semantic_stability = float(np.mean(sem_list)) if sem_list else 0.0 + steering_valid = (abs(corr) >= 0.20) and (uniq_ratio >= 0.55) and (semantic_stability >= 0.40) + # use first seed spectrum for visualization table + spectrum = all_spectra[0] if all_spectra else {} + + # Subtask graph: alpha vs decoded length + a_vals = sorted(spectrum.keys()) + l_vals = [len(spectrum[a]) for a in a_vals] if spectrum else [] + plt.figure(figsize=(7, 3.5)) + plt.plot(a_vals, l_vals, marker="o") + plt.xlabel("Steering alpha") + plt.ylabel("Output length") + plt.title("Task3: Diversity Steering Curve") + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, "task3_diversity_curve.png"), dpi=150, bbox_inches="tight") + plt.close() + + # Save diversity direction + results + np.save(os.path.join(OUTPUT_DIR, "task3_diversity_direction.npy"), direction) + + report = os.path.join(OUTPUT_DIR, "task3_report.txt") + with open(report, "w", encoding="utf-8") as f: + f.write("TASK 3 — CONCEPT VECTORS + PCA STEERING\n" + "="*50 + "\n\n") + f.write(f"PCA: {pca.n_components_} components, " + f"{pca.explained_variance_ratio_.sum()*100:.1f}% variance\n") + f.write(f"Diversity PC: {best_pc} (|r|={corr:.3f} with diversity proxy)\n\n") + f.write(f"Direction validity: {'VALID' if steering_valid else 'WEAK'}\n") + f.write(f"Spectrum unique ratio (mean over {seed_k} seeds): {uniq_ratio:.3f}\n") + f.write(f"Spectrum semantic stability (mean over {seed_k} seeds): {semantic_stability:.3f}\n\n") + f.write("Saved graphs:\n") + f.write(" - task3_concept_space.png\n") + f.write(" - task3_pca_explained_variance.png\n") + f.write(" - 
task3_diversity_curve.png\n\n") + f.write("Diversity spectrum:\n") + for alpha, text in sorted(spectrum.items()): + f.write(f" alpha={alpha:+.1f} → {text}\n") + print(f" Report: {report}") + + +# ── Task 4 ──────────────────────────────────────────────────────────── + +def run_task4(phase, model, src_tok, tgt_tok, device, cfg, + src_list, ref_list, n_samples=200): + print("\n" + "="*65) + print(f" TASK 4 — Step Ablation (phase={phase})") + print("="*65) + + import analysis.step_ablation as step_ablation + + # Legacy API + has_legacy = all(hasattr(step_ablation, fn) for fn in [ + "generate_ablation_configs", "run_ablation_analysis", "plot_ablation_3d" + ]) + + # New API + has_new = hasattr(step_ablation, "run_task4") + + if phase == "generate_configs": + if has_legacy: + print(" Generating ablation configs...") + step_ablation.generate_ablation_configs(output_dir="ablation_configs") + print("\n NEXT STEPS:") + print(" 1. bash ablation_configs/train_all.sh") + print(" 2. python analysis/run_analysis.py --task 4 --phase analyze") + return + print(" This step_ablation version does not expose config generation helpers.") + print(" Use your latest ablation training script/config pipeline directly.") + return + + if phase == "analyze": + existing = [T for T in [4, 8, 16, 32, 64] + if os.path.exists(f"ablation_results/T{T}/best_model.pt")] + only_t = os.environ.get("TASK4_ONLY_T") + if only_t and only_t.isdigit(): + t_req = int(only_t) + existing = [T for T in existing if T == t_req] + if not existing: + print(" No ablation models found at ablation_results/T*/best_model.pt") + return + print(f" Found models for T={existing}") + + if has_legacy: + results = step_ablation.run_ablation_analysis( + ablation_dir="ablation_results", base_cfg=cfg, + src_list=src_list[:200], ref_list=ref_list[:200], + tgt_tokenizer=tgt_tok, device=device, + output_dir=OUTPUT_DIR) + step_ablation.plot_ablation_3d( + results, save_path=os.path.join(OUTPUT_DIR, "task4_ablation_3d.png")) + elif 
has_new: + from inference import load_model as _load_model + models = {} + for T in existing: + ckpt = f"ablation_results/T{T}/best_model.pt" + cfg_t = copy.deepcopy(cfg) + cfg_t["model"]["diffusion_steps"] = T + cfg_t["inference"]["num_steps"] = T + m_t, _ = _load_model(ckpt, cfg_t, device) + m_t.eval() + models[T] = m_t + knee_t = step_ablation.run_task4( + models, src_list[:n_samples], ref_list[:n_samples], tgt_tok, + output_dir=OUTPUT_DIR, n_samples=n_samples) + print(f" New pipeline suggested optimal T={knee_t}") + else: + print(" Unsupported step_ablation API; please sync analysis/step_ablation.py") + return + + # Optional adversarial robustness (legacy helper only) + if hasattr(step_ablation, "run_adversarial_test"): + print("\n Running adversarial robustness test...") + inp_texts = [src_tok.decode([x for x in s[0].tolist() if x > 4]) + for s in src_list[:50]] + step_ablation.run_adversarial_test( + model, src_tok, tgt_tok, + test_inputs=inp_texts, test_refs=ref_list[:50], + device=device, output_dir=OUTPUT_DIR) + + +# ── Task 5 ──────────────────────────────────────────────────────────── + +def run_task5(model, src_tok, tgt_tok, device, cfg, src_list, ref_list, task5_samples=500): + print("\n" + "="*65) + print(" TASK 5 — Classifier-Free Guidance") + print("="*65) + if not hasattr(model.model, 'encode_source'): + print(" Compatibility mode: classifier-guidance unavailable; sweeping decoding controls.") + n = min(100, int(task5_samples), len(src_list), len(ref_list)) + lambdas = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0] + results = [] + for lam in lambdas: + rep_pen = 1.0 + 0.15 * lam + cer_vals, uniq_vals = [], [] + for src, ref in zip(src_list[:n], ref_list[:n]): + out = _generate_ids_compat( + model, src.to(device), num_steps=8, temperature=0.8, top_k=40, + repetition_penalty=rep_pen, diversity_penalty=0.0 + ) + txt, ids = _decode_ids(tgt_tok, out) + cer_vals.append(_cer(txt, ref)) + uniq_vals.append(len(set(ids)) / max(1, len(ids))) + results.append((lam, 
# λ values shared by every Task-5 sweep (guided and compatibility paths).
_TASK5_LAMBDAS = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0]


def _task5_compat_sweep(model, tgt_tok, device, src_list, ref_list, n, pen_per_lambda):
    """Compatibility sweep: map each λ to a repetition penalty and decode.

    Args:
        model, tgt_tok, device: loaded model + target tokenizer.
        src_list, ref_list: validation sources (tensors) and reference strings.
        n: number of (src, ref) pairs to evaluate per λ.
        pen_per_lambda: slope of the λ → repetition-penalty mapping
            (penalty = 1.0 + pen_per_lambda * λ).

    Returns:
        List of (λ, mean CER, mean unique-token ratio) tuples, one per λ.
    """
    results = []
    for lam in _TASK5_LAMBDAS:
        rep_pen = 1.0 + pen_per_lambda * lam
        cer_vals, uniq_vals = [], []
        for src, ref in zip(src_list[:n], ref_list[:n]):
            out = _generate_ids_compat(
                model, src.to(device), num_steps=8, temperature=0.8, top_k=40,
                repetition_penalty=rep_pen, diversity_penalty=0.0
            )
            txt, ids = _decode_ids(tgt_tok, out)
            cer_vals.append(_cer(txt, ref))
            # Unique-token ratio as a cheap per-sample diversity proxy.
            uniq_vals.append(len(set(ids)) / max(1, len(ids)))
        results.append((lam, float(np.mean(cer_vals)), float(np.mean(uniq_vals))))
        print(f" λ={lam:.1f} CER={results[-1][1]:.4f} diversity={results[-1][2]:.3f}")
    return results


def _task5_write_compat_outputs(results):
    """Plot the quality-diversity tradeoff and write the compatibility report.

    Args:
        results: list of (λ, mean CER, mean diversity) tuples from
            _task5_compat_sweep.

    Side effects: writes task5_quality_diversity_tradeoff.png and
    task5_report.txt under OUTPUT_DIR.
    """
    # Subtask graph: quality-diversity tradeoff
    x = [r[1] for r in results]
    y = [r[2] for r in results]
    labels = [r[0] for r in results]
    plt.figure(figsize=(6, 4))
    plt.plot(x, y, marker="o")
    for xi, yi, la in zip(x, y, labels):
        plt.text(xi, yi, f"λ={la:.1f}", fontsize=8)
    plt.xlabel("CER (lower is better)")
    plt.ylabel("Diversity")
    plt.title("Task5: Quality-Diversity Tradeoff")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "task5_quality_diversity_tradeoff.png"), dpi=150, bbox_inches="tight")
    plt.close()
    rep = os.path.join(OUTPUT_DIR, "task5_report.txt")
    with open(rep, "w", encoding="utf-8") as f:
        f.write("TASK 5 — COMPATIBILITY REPORT\n")
        f.write("="*40 + "\n")
        f.write("Guidance classifier path unavailable; λ mapped to repetition penalty.\n\n")
        for lam, cer_v, div_v in results:
            f.write(f"lambda={lam:.1f} CER={cer_v:.4f} diversity={div_v:.3f}\n")
        f.write("\nSaved graphs:\n")
        f.write(" - task5_quality_diversity_tradeoff.png\n")
    print(f" Report: {rep}")


def run_task5(model, src_tok, tgt_tok, device, cfg, src_list, ref_list, task5_samples=500):
    """Task 5: classifier-guided generation with two compatibility fallbacks.

    Preferred path (Steps 1-3): collect (hidden, CER) pairs, train the small
    quality classifier, then sweep guidance scales λ. When the model lacks
    `encode_source` or analysis.quality_classifier exports a mismatched API,
    fall back to a decoding-control sweep where λ is mapped onto a repetition
    penalty instead (previously two near-identical inline copies of that
    sweep; now shared via _task5_compat_sweep/_task5_write_compat_outputs).

    Args:
        model, src_tok, tgt_tok, device, cfg: loaded model + tokenizers.
        src_list, ref_list: validation sources/references.
        task5_samples: budget for data collection and sweeps.

    Side effects: caches data/classifier and writes plots + task5_report.txt
    under OUTPUT_DIR.
    """
    print("\n" + "="*65)
    print(" TASK 5 — Classifier-Free Guidance")
    print("="*65)
    if not hasattr(model.model, 'encode_source'):
        print(" Compatibility mode: classifier-guidance unavailable; sweeping decoding controls.")
        # Original inline sweep used a 0.15 λ→penalty slope and bounded n by
        # both src_list and ref_list lengths.
        n = min(100, int(task5_samples), len(src_list), len(ref_list))
        results = _task5_compat_sweep(model, tgt_tok, device, src_list, ref_list, n, 0.15)
        _task5_write_compat_outputs(results)
        return

    try:
        from analysis.quality_classifier import (
            QualityClassifier, collect_quality_data,
            train_quality_classifier, sweep_guidance_scales)
    except Exception:
        print(" Quality-classifier API mismatch; using compatibility sweep.")
        # This fallback historically used a smaller budget and a steeper
        # 0.2 λ→penalty slope; both are preserved.
        n = min(50, int(task5_samples), len(src_list))
        results = _task5_compat_sweep(model, tgt_tok, device, src_list, ref_list, n, 0.2)
        _task5_write_compat_outputs(results)
        return

    clf_path = os.path.join(OUTPUT_DIR, "task5_quality_classifier.pt")
    d_model = cfg['model']['d_model']

    # Step 1: collect or load training data
    data_path = os.path.join(OUTPUT_DIR, "task5_quality_data.npz")
    if os.path.exists(data_path):
        print(" Loading cached quality data...")
        data = np.load(data_path)
        hidden = data["hidden"]
        quality = data["quality"]
    else:
        print(" Collecting quality data (this takes a few minutes)...")
        n = min(int(task5_samples), len(src_list))
        hidden, quality = collect_quality_data(
            model, src_list[:n], ref_list[:n], tgt_tok,
            t_capture=0, max_samples=n)
        np.savez(data_path, hidden=hidden, quality=quality)
        print(f" Saved quality data: {data_path}")

    # Step 2: train or load classifier
    if os.path.exists(clf_path):
        print(f" Loading cached classifier: {clf_path}")
        clf = QualityClassifier(d_model)
        clf.load_state_dict(torch.load(clf_path, map_location='cpu'))
        clf.eval()
    else:
        print(" Training quality classifier...")
        clf = train_quality_classifier(
            hidden, quality, d_model=d_model,
            epochs=30, batch_size=64, lr=1e-3,
            save_path=clf_path)
        clf.eval()

    # Step 3: guidance scale sweep
    print("\n Guidance scale sweep (λ ∈ {0.0, 0.5, 1.0, 1.5, 2.0, 3.0})...")
    n_sweep = min(80, int(task5_samples), len(src_list))
    results = sweep_guidance_scales(
        model, clf, src_list[:n_sweep], ref_list[:n_sweep],
        tgt_tok, scales=_TASK5_LAMBDAS,
        n_samples=n_sweep, device=device, output_dir=OUTPUT_DIR)

    # Find optimal scale (quality + anti-collapse diversity)
    def _score(s):
        r = results[s]
        return (r["mean_cer"] - 0.05 * r.get("diversity", 0.0))
    best_scale = min(results, key=_score)
    print(f"\n Optimal guidance scale: λ={best_scale:.1f} "
          f"CER={results[best_scale]['mean_cer']:.4f}")

    report = os.path.join(OUTPUT_DIR, "task5_report.txt")
    with open(report, "w") as f:
        f.write("TASK 5 — CLASSIFIER-FREE GUIDANCE\n" + "="*50 + "\n\n")
        f.write(f"Classifier params: {sum(p.numel() for p in clf.parameters())}\n")
        f.write(f"Training samples : {len(hidden)}\n\n")
        f.write("Guidance scale sweep:\n")
        f.write(f" {'λ':>6} {'CER':>8} {'diversity':>10} {'d2':>6} {'sBLEU':>8}\n")
        f.write(" " + "-"*52 + "\n")
        for s in sorted(results.keys()):
            r = results[s]
            marker = " ← optimal" if s == best_scale else ""
            f.write(
                f" {s:>6.1f} {r['mean_cer']:>8.4f} {r['diversity']:>10.3f} "
                f"{r.get('distinct2', 0.0):>6.3f} {r.get('self_bleu', 0.0):>8.3f}{marker}\n"
            )
    print(f" Report: {report}")
def main():
    """CLI entry point: parse args, load model/data, run the selected tasks.

    Mutates the module-level OUTPUT_DIR so every run_task* helper writes to
    the directory chosen on the command line.
    """
    global OUTPUT_DIR

    parser = argparse.ArgumentParser()
    parser.add_argument("--task",
                        choices=["1","2","3","4","5","all"], default="all")
    parser.add_argument("--input",
                        default="dharmo rakṣati rakṣitaḥ",
                        help="IAST input text for Task 2")
    parser.add_argument("--phase",
                        choices=["generate_configs", "analyze"], default="analyze",
                        help="Task 4 phase: generate_configs (before training) or analyze (after)")
    parser.add_argument("--checkpoint", default=None,
                        help="Optional explicit checkpoint path")
    parser.add_argument("--output_dir", default="analysis/outputs",
                        help="Output directory for reports/figures")
    parser.add_argument("--task4_samples", type=int, default=50,
                        help="Samples for Task 4 dry/full evaluation")
    parser.add_argument("--task3_samples", type=int, default=500,
                        help="Samples for Task 3 hidden-state collection")
    parser.add_argument("--task5_samples", type=int, default=500,
                        help="Samples for Task 5 classifier data + sweep")
    args = parser.parse_args()

    OUTPUT_DIR = args.output_dir
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    cfg = copy.deepcopy(CONFIG)
    if args.checkpoint:
        # Infer architecture + training flags from the checkpoint path/name so
        # an explicit --checkpoint does not need matching config edits.
        cfg["model_type"] = infer_model_type_from_checkpoint(args.checkpoint)
        cfg["data"]["include_negative_examples"] = infer_include_negative_from_checkpoint(args.checkpoint)
        # Directory names like ".../T16/best_model.pt" encode the step count.
        ckpt_name = os.path.basename(os.path.dirname(args.checkpoint))
        if ckpt_name.startswith("T") and ckpt_name[1:].isdigit():
            t_val = int(ckpt_name[1:])
            cfg["model"]["diffusion_steps"] = t_val
            cfg["inference"]["num_steps"] = t_val

    # Fall back to CPU when the configured accelerator is not available here.
    requested = cfg["training"]["device"]
    if requested == "mps" and not torch.backends.mps.is_available():
        requested = "cpu"
    elif requested == "cuda" and not torch.cuda.is_available():
        requested = "cpu"
    cfg["training"]["device"] = requested
    device = torch.device(requested)

    print("Loading model and tokenizers...")
    model, src_tok, tgt_tok, cfg = load_everything(cfg, device, ckpt_override=args.checkpoint)

    # Load val data for tasks that need corpus/context (Tasks 2, 3, 4, 5)
    needs_data = args.task in ("2", "3", "4", "5", "all")
    if needs_data:
        print("Loading validation data...")
        src_list, ref_list, inp_list = load_val_data(cfg, src_tok, tgt_tok, n=500)
    else:
        src_list, ref_list, inp_list = [], [], []

    tasks = (["1","2","3","4","5"] if args.task == "all"
             else [args.task])

    for task in tasks:
        if task == "1":
            run_task1(model, src_tok, device)
        elif task == "2":
            run_task2(model, src_tok, tgt_tok, device, args.input, cfg, corpus_inputs=inp_list)
        elif task == "3":
            run_task3(model, src_tok, tgt_tok, device, src_list, ref_list, n_samples=args.task3_samples)
        elif task == "4":
            run_task4(args.phase, model, src_tok, tgt_tok, device, cfg,
                      src_list, ref_list, n_samples=args.task4_samples)
        elif task == "5":
            run_task5(
                model, src_tok, tgt_tok, device, cfg, src_list, ref_list,
                task5_samples=args.task5_samples
            )

    print(f"\n{'='*65}")
    print(f" All outputs saved to: {OUTPUT_DIR}/")
    print("="*65)


if __name__ == "__main__":
    main()
+# Running it with T=4 means the model only sees t∈{0,1,2,3} — which it +# was never trained on at those scales. Outputs are meaningless. +# You must train a separate model for each T value. +# +# Also implements adversarial robustness test (no retraining): +# Takes your existing T=128 model and tests whether corrupted IAST +# inputs (typos, character swaps) cause proportional output degradation. +# """ +# +# import torch +# import torch.nn.functional as F +# import numpy as np +# import os +# import sys +# import time +# import json +# import copy +# from typing import List, Dict, Optional +# +# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +# +# +# # ── Phase 1: Config generation ──────────────────────────────────────── +# +# T_VALUES = [4, 8, 16, 32, 64] +# +# def generate_ablation_configs(base_config_path: str = "config.py", +# output_dir: str = "ablation_configs"): +# """ +# Generate one config file per T value. +# Each config is a copy of the base config with diffusion_steps changed. 
+# +# After running this, train each model: +# for T in 4 8 16 32 64; do +# cp ablation_configs/config_T${T}.py config.py +# python train.py +# mv results7/d3pm_cross_attention_neg_False \ +# ablation_results/T${T} +# done +# """ +# os.makedirs(output_dir, exist_ok=True) +# +# # Read base config +# with open(base_config_path, "r") as f: +# base_src = f.read() +# +# for T in T_VALUES: +# # Replace diffusion_steps and num_steps +# cfg_src = base_src +# cfg_src = cfg_src.replace( +# '"diffusion_steps": 128', +# f'"diffusion_steps": {T}' +# ) +# cfg_src = cfg_src.replace( +# "'diffusion_steps': 128", +# f"'diffusion_steps': {T}" +# ) +# cfg_src = cfg_src.replace( +# '"num_steps": 128', +# f'"num_steps": {T}' +# ) +# cfg_src = cfg_src.replace( +# "'num_steps': 128", +# f"'num_steps': {T}" +# ) +# out_path = os.path.join(output_dir, f"config_T{T}.py") +# with open(out_path, "w") as f: +# f.write(f"# Ablation config: T={T} diffusion steps\n") +# f.write(cfg_src) +# print(f" Wrote: {out_path}") +# +# # Write a shell script to train all +# shell_script = os.path.join(output_dir, "train_all.sh") +# with open(shell_script, "w") as f: +# f.write("#!/bin/bash\n") +# f.write("# Run this script to train all ablation models\n\n") +# for T in T_VALUES: +# f.write(f"echo '=== Training T={T} ==='\n") +# f.write(f"cp {output_dir}/config_T{T}.py config.py\n") +# f.write(f"python train.py\n") +# f.write(f"mkdir -p ablation_results/T{T}\n") +# f.write(f"cp -r results7/d3pm_cross_attention_neg_False/best_model.pt " +# f"ablation_results/T{T}/best_model.pt\n") +# f.write(f"cp -r results7/d3pm_cross_attention_neg_False/train.log " +# f"ablation_results/T{T}/train.log\n\n") +# os.chmod(shell_script, 0o755) +# print(f"\nTraining script: {shell_script}") +# print(f"Run: bash {shell_script}") +# +# +# # ── Phase 2: Analysis (after models are trained) ────────────────────── +# +# def compute_cer(pred: str, ref: str) -> float: +# if not ref: +# return 1.0 +# +# def edit_distance(s1, s2): +# m, n 
= len(s1), len(s2) +# dp = list(range(n + 1)) +# for i in range(1, m + 1): +# prev, dp[0] = dp[0], i +# for j in range(1, n + 1): +# temp = dp[j] +# dp[j] = prev if s1[i-1] == s2[j-1] else 1 + min(prev, dp[j], dp[j-1]) +# prev = temp +# return dp[n] +# +# return edit_distance(pred, ref) / max(len(ref), 1) +# +# +# def evaluate_model( +# model, +# src_list: List[torch.Tensor], +# ref_list: List[str], +# tgt_tokenizer, +# n_samples: int = 200, +# temperature: float = 0.8, +# top_k: int = 40, +# ) -> Dict: +# """ +# Generate n_samples outputs and compute CER + generation speed. +# +# Returns dict with: +# mean_cer : average CER over samples +# generation_s : total wall-clock seconds for all generations +# speed_per_sample: seconds per sample +# cer_list : per-sample CER values +# """ +# device = next(model.parameters()).device +# n = min(n_samples, len(src_list)) +# cer_list = [] +# +# start = time.perf_counter() +# for i, (src, ref) in enumerate(zip(src_list[:n], ref_list[:n])): +# if src.dim() == 1: +# src = src.unsqueeze(0) +# +# with torch.no_grad(): +# if hasattr(model.model, 'generate_cached'): +# out = model.model.generate_cached( +# src.to(device), temperature=temperature, top_k=top_k +# ) +# else: +# out = model.generate( +# src.to(device), temperature=temperature, top_k=top_k +# ) +# +# ids = [x for x in out[0].tolist() if x > 4] +# pred = tgt_tokenizer.decode(ids).strip() +# cer = compute_cer(pred, ref) +# cer_list.append(cer) +# +# elapsed = time.perf_counter() - start +# +# return { +# "mean_cer": float(np.mean(cer_list)), +# "std_cer": float(np.std(cer_list)), +# "generation_s": elapsed, +# "speed_per_sample": elapsed / max(n, 1), +# "cer_list": cer_list, +# "n_samples": n, +# } +# +# +# def run_ablation_analysis( +# ablation_dir: str = "ablation_results", +# base_cfg: dict = None, +# src_list: List[torch.Tensor] = None, +# ref_list: List[str] = None, +# tgt_tokenizer = None, +# device: torch.device = None, +# output_dir: str = "analysis/outputs", +# ) 
-> Dict: +# """ +# Load each trained model and evaluate. +# Produces results dict and 3D plot. +# +# Expects ablation_results/T{N}/best_model.pt for each T in T_VALUES. +# """ +# from inference import load_model +# +# results = {} +# for T in T_VALUES: +# ckpt = os.path.join(ablation_dir, f"T{T}", "best_model.pt") +# if not os.path.exists(ckpt): +# print(f" SKIP T={T}: no checkpoint at {ckpt}") +# continue +# +# print(f"\nEvaluating T={T}...") +# cfg_T = copy.deepcopy(base_cfg) +# cfg_T['model']['diffusion_steps'] = T +# cfg_T['inference']['num_steps'] = T +# +# model, cfg_T = load_model(ckpt, cfg_T, device) +# model.eval() +# +# metrics = evaluate_model( +# model, src_list, ref_list, tgt_tokenizer, n_samples=200 +# ) +# results[T] = metrics +# print(f" T={T} CER={metrics['mean_cer']:.4f} " +# f"speed={metrics['speed_per_sample']:.3f}s/sample") +# +# del model +# +# # Save results +# os.makedirs(output_dir, exist_ok=True) +# results_path = os.path.join(output_dir, "ablation_results.json") +# with open(results_path, "w") as f: +# json.dump({str(k): {kk: vv for kk, vv in v.items() if kk != 'cer_list'} +# for k, v in results.items()}, f, indent=2) +# print(f"\nResults saved: {results_path}") +# +# return results +# +# +# def plot_ablation_3d( +# results: Dict, +# save_path: Optional[str] = None, +# ): +# """ +# 3D plot: X=diffusion_steps, Y=generation_speed(s/sample), Z=CER. +# Also produces a 2D summary plot. 
+# """ +# try: +# import matplotlib.pyplot as plt +# from mpl_toolkits.mplot3d import Axes3D +# except ImportError: +# print("pip install matplotlib.") +# return +# +# T_list = sorted(results.keys()) +# cers = [results[T]["mean_cer"] for T in T_list] +# speeds = [results[T]["speed_per_sample"] for T in T_list] +# +# # ── 3D plot ─────────────────────────────────────────────────────── +# fig = plt.figure(figsize=(14, 5)) +# +# ax3d = fig.add_subplot(121, projection='3d') +# ax3d.scatter(T_list, speeds, cers, c=cers, cmap='RdYlGn_r', s=80) +# for T, s, c in zip(T_list, speeds, cers): +# ax3d.text(T, s, c, f"T={T}", fontsize=8) +# ax3d.set_xlabel("Diffusion steps T", fontsize=9) +# ax3d.set_ylabel("Speed (s/sample)", fontsize=9) +# ax3d.set_zlabel("CER (↓ better)", fontsize=9) +# ax3d.set_title("T vs speed vs CER", fontsize=10) +# +# # ── 2D CER vs T (find the knee) ────────────────────────────────── +# ax2d = fig.add_subplot(122) +# ax2d.plot(T_list, cers, 'o-', linewidth=1.8, color='coral', markersize=7) +# for T, c in zip(T_list, cers): +# ax2d.annotate(f"{c:.3f}", (T, c), textcoords="offset points", +# xytext=(0, 8), fontsize=8, ha='center') +# +# # Find knee: largest CER drop per unit T (elbow method) +# if len(T_list) >= 3: +# drops = [cers[i] - cers[i+1] for i in range(len(cers)-1)] +# knee_i = int(np.argmax(drops)) +# knee_T = T_list[knee_i + 1] +# ax2d.axvline(knee_T, color='steelblue', linestyle='--', linewidth=1.2, +# label=f"Knee at T={knee_T}") +# ax2d.legend(fontsize=9) +# +# ax2d.set_xlabel("Diffusion steps T", fontsize=10) +# ax2d.set_ylabel("CER (lower = better)", fontsize=10) +# ax2d.set_title("CER vs diffusion steps", fontsize=10) +# ax2d.set_ylim(0, max(cers) * 1.1) +# +# plt.tight_layout() +# if save_path: +# os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True) +# plt.savefig(save_path, dpi=150, bbox_inches='tight') +# print(f"Saved: {save_path}") +# else: +# plt.show() +# plt.close() +# +# +# # ── Adversarial robustness test (no 
retraining needed) ─────────────── +# +# def corrupt_iast(text: str, corruption_rate: float = 0.05) -> str: +# """ +# Introduce random corruption into IAST text: +# - Character swap (adjacent chars swapped) +# - Character deletion +# - Random character insertion +# +# Models rate as 5% to 20% corruption to test robustness. +# """ +# import random +# chars = list(text) +# n_corrupt = max(1, int(len(chars) * corruption_rate)) +# +# for _ in range(n_corrupt): +# op = random.choice(['swap', 'delete', 'insert']) +# pos = random.randint(0, len(chars) - 1) +# +# if op == 'swap' and pos < len(chars) - 1: +# chars[pos], chars[pos+1] = chars[pos+1], chars[pos] +# elif op == 'delete' and len(chars) > 1: +# chars.pop(pos) +# elif op == 'insert': +# chars.insert(pos, random.choice('abcdeimnostu')) +# +# return "".join(chars) +# +# +# @torch.no_grad() +# def run_adversarial_test( +# model, +# src_tokenizer, +# tgt_tokenizer, +# test_inputs: List[str], +# test_refs: List[str], +# corruption_rates: List[float] = [0.0, 0.05, 0.10, 0.15, 0.20], +# device: torch.device = None, +# output_dir: str = "analysis/outputs", +# ) -> Dict: +# """ +# Test if CER degrades proportionally with IAST corruption. +# Uses existing trained model — no retraining. 
+# """ +# device = device or next(model.parameters()).device +# results = {} +# +# print("\nAdversarial robustness test...") +# for rate in corruption_rates: +# cer_list = [] +# for text, ref in zip(test_inputs, test_refs): +# corrupted = corrupt_iast(text, rate) +# ids = src_tokenizer.encode(corrupted) +# src = torch.tensor([ids], dtype=torch.long, device=device) +# +# if hasattr(model.model, 'generate_cached'): +# out = model.model.generate_cached(src) +# else: +# out = model.generate(src) +# +# pred_ids = [x for x in out[0].tolist() if x > 4] +# pred = tgt_tokenizer.decode(pred_ids).strip() +# cer_list.append(compute_cer(pred, ref)) +# +# mean_cer = float(np.mean(cer_list)) +# results[rate] = mean_cer +# print(f" corruption={rate*100:.0f}% → CER={mean_cer:.4f}") +# +# # Save + plot +# os.makedirs(output_dir, exist_ok=True) +# try: +# import matplotlib.pyplot as plt +# fig, ax = plt.subplots(figsize=(8, 4)) +# rates = [r * 100 for r in corruption_rates] +# cers = [results[r] for r in corruption_rates] +# ax.plot(rates, cers, 'o-', linewidth=1.8, color='steelblue', markersize=7) +# ax.set_xlabel("IAST corruption rate (%)", fontsize=11) +# ax.set_ylabel("CER", fontsize=11) +# ax.set_title("Model robustness to IAST input corruption", fontsize=11) +# ax.set_ylim(0, max(cers) * 1.2) +# plt.tight_layout() +# plt.savefig(os.path.join(output_dir, "adversarial_robustness.png"), +# dpi=150, bbox_inches='tight') +# plt.close() +# print(f" Saved: {output_dir}/adversarial_robustness.png") +# except ImportError: +# pass +# +# with open(os.path.join(output_dir, "adversarial_results.json"), "w") as f: +# json.dump({str(k): v for k, v in results.items()}, f, indent=2) +# +# return results +""" +analysis/task4_pipeline.py +================================ +Correct Task 4 Pipeline: + +PHASE 1 → Evaluate all models +PHASE 2 → Analyze + detect optimal T + +NO early decision making. 
+""" + +import torch +import numpy as np +import time +import os +import json +from typing import Dict, List +from difflib import SequenceMatcher +from collections import Counter + + +# ───────────────────────────────────────────── +# Load Metrics +# ───────────────────────────────────────────── + +def load_metrics(): + try: + from bert_score import score as bert_score + except Exception: + bert_score = None + from nltk.translate.bleu_score import sentence_bleu + try: + from sentence_transformers import SentenceTransformer, util + st_model = SentenceTransformer('all-MiniLM-L6-v2') + return bert_score, st_model, util, sentence_bleu + except Exception: + # Offline-safe fallback: skip sentence-transformer similarity. + return bert_score, None, None, sentence_bleu + + +# ───────────────────────────────────────────── +# PHASE 1 — Evaluate ALL models +# ───────────────────────────────────────────── + +def evaluate_all_models(models: Dict[int, object], + src_list, + ref_list, + tgt_tokenizer, + n_samples=200, + output_dir: str = "analysis/outputs"): + + bert_score_fn, st_model, util, bleu_fn = load_metrics() + + results = {} + + print("\n=== PHASE 1: Evaluating ALL models ===") + + for T, model in sorted(models.items()): + print(f"\nEvaluating T={T}...") + + device = next(model.parameters()).device + preds, refs = [], [] + + start = time.perf_counter() + + for src, ref in zip(src_list[:n_samples], ref_list[:n_samples]): + if src.dim() == 1: + src = src.unsqueeze(0) + + with torch.no_grad(): + if hasattr(model, "model") and hasattr(model.model, "generate_cached"): + out = model.model.generate_cached(src.to(device)) + else: + # Fallback for wrappers that only expose top-level generate. 
def evaluate_all_models(models: Dict[int, object],
                        src_list,
                        ref_list,
                        tgt_tokenizer,
                        n_samples=200,
                        output_dir: str = "analysis/outputs"):
    """Phase 1: generate with every T-variant model and score the outputs.

    Args:
        models: mapping T (diffusion step count) → loaded model.
        src_list: source tensors; assumes items are integer tensors shaped
            [L] or [1, L] (the dim()==1 unsqueeze below handles the former).
        ref_list: reference strings aligned with src_list.
        tgt_tokenizer: target-side tokenizer used for decoding.
        n_samples: number of (src, ref) pairs evaluated per model.
        output_dir: where task4_raw_results.json is written.

    Returns:
        dict T → {"bertscore_f1", "semantic_sim", "bleu", "speed_per_sample"}.
        Each metric has an offline fallback, so the function never requires
        the heavyweight backends to be installed.
    """

    bert_score_fn, st_model, util, bleu_fn = load_metrics()

    results = {}

    print("\n=== PHASE 1: Evaluating ALL models ===")

    for T, model in sorted(models.items()):
        print(f"\nEvaluating T={T}...")

        device = next(model.parameters()).device
        preds, refs = [], []

        start = time.perf_counter()

        for src, ref in zip(src_list[:n_samples], ref_list[:n_samples]):
            if src.dim() == 1:
                src = src.unsqueeze(0)

            with torch.no_grad():
                if hasattr(model, "model") and hasattr(model.model, "generate_cached"):
                    out = model.model.generate_cached(src.to(device))
                else:
                    # Fallback for wrappers that only expose top-level generate.
                    out = model.generate(src.to(device))

            # NOTE(review): ids <= 4 are presumably special tokens stripped
            # before decoding — confirm against the tokenizer vocabulary.
            ids = [x for x in out[0].tolist() if x > 4]
            pred = tgt_tokenizer.decode(ids).strip()

            preds.append(pred)
            refs.append(ref)

        elapsed = time.perf_counter() - start

        # BERTScore (fallback to lexical similarity if unavailable/offline)
        try:
            if bert_score_fn is not None:
                _, _, F1 = bert_score_fn(preds, refs, lang="hi", verbose=False)
                bert_f1 = float(F1.mean())
            else:
                raise RuntimeError("bertscore unavailable")
        except Exception:
            bert_f1 = float(np.mean([SequenceMatcher(None, p, r).ratio() for p, r in zip(preds, refs)]))

        # Sentence similarity (distinct from BERT fallback)
        if st_model is not None:
            emb_p = st_model.encode(preds, convert_to_tensor=True)
            emb_r = st_model.encode(refs, convert_to_tensor=True)
            sim = util.cos_sim(emb_p, emb_r).diagonal().mean().item()
        else:
            # token-overlap F1 proxy (different behavior from char-level similarity)
            f1s = []
            for p, r in zip(preds, refs):
                pt = [t for t in p.split() if t]
                rt = [t for t in r.split() if t]
                if not pt or not rt:
                    f1s.append(0.0)
                    continue
                cp, cr = Counter(pt), Counter(rt)
                inter = sum((cp & cr).values())
                prec = inter / max(1, len(pt))
                rec = inter / max(1, len(rt))
                f1s.append((2 * prec * rec / max(1e-9, prec + rec)))
            sim = float(np.mean(f1s)) if f1s else 0.0
            # Guard against NaN leaking out of the proxy computation.
            if not np.isfinite(sim):
                sim = float(np.mean([SequenceMatcher(None, p, r).ratio() for p, r in zip(preds, refs)]))

        # BLEU
        bleu_scores = [
            bleu_fn([r.split()], p.split())
            for p, r in zip(preds, refs)
        ]

        results[T] = {
            "bertscore_f1": bert_f1,
            "semantic_sim": sim,
            "bleu": float(np.mean(bleu_scores)),
            "speed_per_sample": elapsed / max(1, len(preds))
        }

        print(f" BERTScore: {bert_f1:.4f}")
        print(f" Sim: {sim:.4f}")
        print(f" BLEU: {results[T]['bleu']:.4f}")
        print(f" Speed: {results[T]['speed_per_sample']:.4f}s")

    # Save raw results
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, "task4_raw_results.json"), "w") as f:
        json.dump(results, f, indent=2)

    return results
"task4_raw_results.json"), "w") as f: + json.dump(results, f, indent=2) + + return results + + +# ───────────────────────────────────────────── +# PHASE 2 — Analyze results (Knee Detection) +# ───────────────────────────────────────────── + +def analyze_results(results: Dict): + print("\n=== PHASE 2: Analysis ===") + + T_list = sorted(results.keys()) + scores = [results[T]["bertscore_f1"] for T in T_list] + + gains = [scores[i+1] - scores[i] for i in range(len(scores)-1)] + + print("\nMarginal Gains:") + for i, g in enumerate(gains): + print(f" T{T_list[i]} → T{T_list[i+1]}: +{g:.4f}") + + # Robust utility selection (quality + semantics + speed regularizer) + bvals = np.array([results[T]["bertscore_f1"] for T in T_list], dtype=np.float32) + svals = np.array([results[T]["semantic_sim"] for T in T_list], dtype=np.float32) + tvals = np.array([results[T]["speed_per_sample"] for T in T_list], dtype=np.float32) + b_norm = (bvals - bvals.min()) / max(1e-9, (bvals.max() - bvals.min())) + s_norm = (svals - svals.min()) / max(1e-9, (svals.max() - svals.min())) + t_norm = (tvals - tvals.min()) / max(1e-9, (tvals.max() - tvals.min())) + utility = 0.50 * b_norm + 0.30 * s_norm - 0.20 * t_norm + knee_T = T_list[int(np.argmax(utility))] + + print(f"\n✅ Optimal T (semantic-speed tradeoff): {knee_T}") + + return knee_T, gains + + +# ───────────────────────────────────────────── +# 3D Plot (BERTScore) +# ───────────────────────────────────────────── + +def plot_3d(results, output_dir: str = "analysis/outputs"): + import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + + T_list = sorted(results.keys()) + + X = T_list + Y = [results[T]["speed_per_sample"] for T in T_list] + Z = [results[T]["bertscore_f1"] for T in T_list] + + fig = plt.figure(figsize=(10, 6)) + ax = fig.add_subplot(111, projection='3d') + + ax.scatter(X, Y, Z) + + for x, y, z in zip(X, Y, Z): + ax.text(x, y, z, f"T={x}", fontsize=8) + + ax.set_xlabel("Diffusion Steps") + ax.set_ylabel("Speed") + 
def plot_3d(results, output_dir: str = "analysis/outputs"):
    """Scatter T vs speed vs BERT-F1 in 3D and save task4_3d.png.

    Args:
        results: dict T → metric dict from evaluate_all_models.
        output_dir: destination directory for the figure.
    """
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D

    T_list = sorted(results.keys())

    X = T_list
    Y = [results[T]["speed_per_sample"] for T in T_list]
    Z = [results[T]["bertscore_f1"] for T in T_list]

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(X, Y, Z)

    # Label each point with its step count so the tradeoff is readable.
    for x, y, z in zip(X, Y, Z):
        ax.text(x, y, z, f"T={x}", fontsize=8)

    ax.set_xlabel("Diffusion Steps")
    ax.set_ylabel("Speed")
    ax.set_zlabel("BERTScore")

    plt.title("3D Tradeoff: Steps vs Speed vs Quality")

    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "task4_3d.png"))
    plt.close()

    print("Saved 3D plot")


# ─────────────────────────────────────────────
# FINAL RUNNER
# ─────────────────────────────────────────────

def run_task4(models, src_list, ref_list, tgt_tokenizer,
              output_dir: str = "analysis/outputs", n_samples: int = 200):
    """Full Task-4 pipeline: evaluate every model, analyze, plot, report.

    Args:
        models: dict T → loaded model variant.
        src_list, ref_list: validation sources/references.
        tgt_tokenizer: target-side tokenizer for decoding.
        output_dir: where plots, JSON and the text report are written.
        n_samples: evaluation budget per model.

    Returns:
        The selected optimal diffusion-step count T.
    """

    # Phase 1: Evaluate all
    results = evaluate_all_models(
        models, src_list, ref_list, tgt_tokenizer, n_samples=n_samples, output_dir=output_dir
    )

    # Phase 2: Analyze
    knee_T, gains = analyze_results(results)

    # Plot
    plot_3d(results, output_dir=output_dir)

    # Save detailed report
    report_path = os.path.join(output_dir, "task4_report.txt")
    with open(report_path, "w") as f:
        f.write("TASK 4 — SEMANTIC ROBUSTNESS ABLATION\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Optimal diffusion steps = {knee_T}\n\n")
        f.write(f"{'T':>6} {'BERT-F1':>10} {'SEM_SIM':>10} {'BLEU':>8} {'sec/sample':>12}\n")
        f.write(" " + "-" * 56 + "\n")
        for T in sorted(results.keys()):
            r = results[T]
            f.write(
                f"{T:>6} {r['bertscore_f1']:>10.4f} {r['semantic_sim']:>10.4f} "
                f"{r['bleu']:>8.4f} {r['speed_per_sample']:>12.4f}\n"
            )
        f.write("\nMarginal gains (BERT-F1):\n")
        for i, g in enumerate(gains):
            t0 = sorted(results.keys())[i]
            t1 = sorted(results.keys())[i + 1]
            f.write(f" T{t0} -> T{t1}: {g:+.4f}\n")
        f.write("\nSaved plots/files:\n")
        f.write(" - task4_3d.png\n")
        f.write(" - task4_raw_results.json\n")

    return knee_T
diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task1_kv_cache.txt b/analysis_outputs/outputs_all_models_20260325/T16/task1_kv_cache.txt new file mode 100644 index 0000000000000000000000000000000000000000..95721b0b99d054c1e8adfbe4c16fd26e89cd811a --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task1_kv_cache.txt @@ -0,0 +1,15 @@ +TASK 1 — KV CACHE BENCHMARK +======================================== + +has_generate_cached=True +memory_profile=Torch CPU mem-event reduction: 30.4% @ src_len=64 (std=2143.0MB, cache=1492.1MB) + + src_len standard(s) cached(s) speedup encoder% + 16 0.893 0.571 1.56x 40.0% + 32 0.751 0.509 1.48x 42.3% + 64 1.141 0.822 1.39x 40.7% + +Saved graphs: + - task1_time_comparison.png + - task1_speedup.png + - task1_encoder_cost.png diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task1_speedup.png b/analysis_outputs/outputs_all_models_20260325/T16/task1_speedup.png new file mode 100644 index 0000000000000000000000000000000000000000..6454abe912fa5025d7a8aff1353caf44d577406b Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task1_speedup.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task1_time_comparison.png b/analysis_outputs/outputs_all_models_20260325/T16/task1_time_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..5e0841c9992266c550d959bba6f486a184be77db Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task1_time_comparison.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task2_all_layers_t0.png b/analysis_outputs/outputs_all_models_20260325/T16/task2_all_layers_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..b085150ec4fbf3f4bc940c6821973ba15e705fb8 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task2_all_layers_t0.png differ diff --git 
a/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_evolution.png b/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_evolution.png new file mode 100644 index 0000000000000000000000000000000000000000..c6e90fb66c58406976bcfef9ddbffe606f8fca28 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_evolution.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_t0.png b/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..3607144ccfd8e55b2edcb80a4d5153fa0295743f Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_t15.png b/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_t15.png new file mode 100644 index 0000000000000000000000000000000000000000..9e165a517ca411b17bccff45b80b5909154ab48d Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task2_attn_t15.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task2_report.txt b/analysis_outputs/outputs_all_models_20260325/T16/task2_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..d637ae154601e3cd4f3bab60cb8bbfa23a91e630 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task2_report.txt @@ -0,0 +1,35 @@ +TASK 2 — ATTENTION + DRIFT REPORT +================================================== + +Input : dharmo rakṣati rakṣitaḥ +Output: धर्मो रक्षति रक्षितः + +Captured steps: 16 +Analysis quality: WEAK +Final output uniq-ratio: 1.000 +Degenerate output: NO +Multi-sample semantic score (n<=8): 0.1471 +Lock-in step (CER<=0.05): t=0 +Locked tokens: 38 Flexible tokens: 42 +TF-IDF vs attention stability corr: 0.9294 +TF-IDF status: OK + +Saved graphs: + - task2_attn_t*.png / task2_all_layers_t0.png + - task2_attn_evolution.png + - 
task2_semantic_drift.png + - task2_source_alignment.png + - task2_tfidf_vs_attention.png + +Step trajectory (first 10 rows) +------------------------------------------------------------ +t= 15 bert=0.0475 drift=0.9525 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 14 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 13 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 12 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 11 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 10 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 9 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 8 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 7 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् +t= 6 bert=0.0478 drift=0.9522 text=धर्मो ति रक्ष रक्षि तः तः तः तः ितः तः धर्मो धर्मो धर्मो धर् diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task2_semantic_drift.png b/analysis_outputs/outputs_all_models_20260325/T16/task2_semantic_drift.png new file mode 100644 index 0000000000000000000000000000000000000000..7eff7f445e835a2c13e2c7d2e70e87a07ec29b41 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task2_semantic_drift.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task2_source_alignment.png b/analysis_outputs/outputs_all_models_20260325/T16/task2_source_alignment.png new file mode 100644 index 0000000000000000000000000000000000000000..7a3720601610f2621482b2426a02eabb1e0c559d Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task2_source_alignment.png differ diff --git 
a/analysis_outputs/outputs_all_models_20260325/T16/task2_tfidf_vs_attention.png b/analysis_outputs/outputs_all_models_20260325/T16/task2_tfidf_vs_attention.png new file mode 100644 index 0000000000000000000000000000000000000000..e5a6b7c5ec0c35aa2a17f8ccf2459267a9e0e3db Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task2_tfidf_vs_attention.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task3_concept_space.png b/analysis_outputs/outputs_all_models_20260325/T16/task3_concept_space.png new file mode 100644 index 0000000000000000000000000000000000000000..c449bbe1580b2e2c1534083c387c35aad0bd4a0b Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task3_concept_space.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task3_diversity_curve.png b/analysis_outputs/outputs_all_models_20260325/T16/task3_diversity_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..a9bc33e7ffbe5ca5803b0d0d057fcf9fa824fc74 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task3_diversity_curve.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task3_diversity_direction.npy b/analysis_outputs/outputs_all_models_20260325/T16/task3_diversity_direction.npy new file mode 100644 index 0000000000000000000000000000000000000000..46e6666631bf1badb8034f2790b031f8211c7e65 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task3_diversity_direction.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:250b81c1f8cc9537240873d00539df1e8a30e6c07b260d4c05df23fb32c704d6 +size 4224 diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task3_pca_explained_variance.png b/analysis_outputs/outputs_all_models_20260325/T16/task3_pca_explained_variance.png new file mode 100644 index 0000000000000000000000000000000000000000..cfc0403c4db80a0fea44d8e48554f3fb1ca6f463 Binary files /dev/null and 
b/analysis_outputs/outputs_all_models_20260325/T16/task3_pca_explained_variance.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task3_report.txt b/analysis_outputs/outputs_all_models_20260325/T16/task3_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cfe77248019a83349d2d05d18e27f097f372d10 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task3_report.txt @@ -0,0 +1,21 @@ +TASK 3 — CONCEPT VECTORS + PCA STEERING +================================================== + +PCA: 50 components, 74.8% variance +Diversity PC: 0 (|r|=0.325 with diversity proxy) + +Direction validity: WEAK +Spectrum unique ratio (mean over 5 seeds): 1.000 +Spectrum semantic stability (mean over 5 seeds): 0.312 + +Saved graphs: + - task3_concept_space.png + - task3_pca_explained_variance.png + - task3_diversity_curve.png + +Diversity spectrum: + alpha=-2.0 → बले वेध विवर् धान वीर्य वीर्य धिं सिंहा भि̱ सन वस्तु वेध वै वेध वस्तु सन सन सिंहा सिंहा वीर्य वीर्य वस्तु सन रुते प्रभवति मन वेध बले बले र्वृ प्रपूजयेत् युगा मलि धान तुल वीर्य वीर्य वीर्य वीर्य वीर्य वीर्य धान तुल कालेन युगा वेध बले वेध वेध च्छे ष्मस् यस्या काष्ठा ज्ञप्त अर्णव धिं धिं वस्तु धिं सन तया सन सन देवाः देवाः स्वातन्त्र अर्णव मह वस्तु मुष् सन धिं धिं धिं विक्र त्र मह हस्ते च्छे मह + alpha=-1.0 → बले र् अ तुल वीर्य वीर्य गुरु सिंहा सन सन विलेप वै वै वै गतस्य वेध सन सिंहा सिंहा स्य स्य । सन वै वै वै बले बले बले बले र् अ अ तुल तुल वीर्य वीर्य वीर्य वीर्य वीर्य वीर्य तुल तुल तुल ् बले वेध दिव्यां मान वै अप्सु सन ॥ ॥ वस्तु सिंहा सन सन विक्र सन स काष्ठा सन सन सन कार सन सन सन सन भ बल ु सिंहा सन सिंहा सन म् म् सन + alpha=+0.0 → बले र् अ तुल वीर्य वीर्य स्य सिंहा सन सन पितो वै वै वै दक्षिणां सन सन सिंहा सिंहा स्य स्य स्य सन गतस्य वै वै ॥ बले बले र् र् अ अ । तुल वीर्य वीर्य वीर्य वीर्य वीर्य तुल तुल तुल तुल अ स बले बले वै वै ॥ ॥ ॥ सन सन सिंहा स सन सन सन सन सन सन सन सन सन सन ॥ ॥ सन सन शतैः ॥ सिंहा सिंहा द सिंहा सन त् सन + alpha=+1.0 → बले र् अ अ विशुद्धं स्य स्य सिंहा 
सिंहा सन गतस्य वै वै वै वेत्ति सन सन सिंहा स्य स्य स्य स्य सन वै वै स मल बले बले र् र् व अ अ तुल वीर्य वीर्य वीर्य स्य वीर्य स्य तुल ानु अ अ । र् व ॥ वै वै सन द ॥ ॥ सिंहा सिंहा ॥ सं सन ॥ ॥ व ॥ ॥ हेम सन सन व ॥ ै ॥ वै भ न न ॥ मित्रो सिंहा सन + alpha=+2.0 → आविश र् अ किंचिद् वर स्य स्य सिंहा सं निमे ञ् सं वै वै ञ् सन कृपा सिंहा स्य स्य स्य स्य फणा ञ् वै ौ जिह्व बले मानाः र् र् वराय अ माने वर विशुद्धं स्य स्य स्य – वर विशुद्धं व वर अ कृपा ॥ परम् ॥ कश्चि वै ॥ ञ् ञ् सं स्य स्य तम् व प्रवर्तन्ते कर्मसु परम् वर ते ॥ व ञ् ॥ ॥ सं द ॥ ॥ वर न्द ̱व ॥ व व ै diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task4_3d.png b/analysis_outputs/outputs_all_models_20260325/T16/task4_3d.png new file mode 100644 index 0000000000000000000000000000000000000000..d8cf6a1dc2a9b42deb7f272060e35b5c21d43cf8 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task4_3d.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task4_raw_results.json b/analysis_outputs/outputs_all_models_20260325/T16/task4_raw_results.json new file mode 100644 index 0000000000000000000000000000000000000000..88024b154977f902c39e0bb4087cfdadaec6305c --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task4_raw_results.json @@ -0,0 +1,8 @@ +{ + "16": { + "bertscore_f1": 0.25743605086845023, + "semantic_sim": 0.05798209163692987, + "bleu": 0.0007454091523007641, + "speed_per_sample": 0.9068318999983603 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task4_report.txt b/analysis_outputs/outputs_all_models_20260325/T16/task4_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a3a0b3ce4836085cd97e6cc68429da95eafd4c3 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task4_report.txt @@ -0,0 +1,14 @@ +TASK 4 — SEMANTIC ROBUSTNESS ABLATION +================================================== + +Optimal diffusion steps = 16 + + T BERT-F1 SEM_SIM BLEU sec/sample + 
-------------------------------------------------------- + 16 0.2574 0.0580 0.0007 0.9068 + +Marginal gains (BERT-F1): + +Saved plots/files: + - task4_3d.png + - task4_raw_results.json diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task5_guidance_results.json b/analysis_outputs/outputs_all_models_20260325/T16/task5_guidance_results.json new file mode 100644 index 0000000000000000000000000000000000000000..877149ad91d50e0ce6924efc18812f9027a1d2ec --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task5_guidance_results.json @@ -0,0 +1,44 @@ +{ + "0.0": { + "mean_cer": 0.8335914296177765, + "diversity": 0.8084225118773136, + "sent_unique": 1.0, + "distinct2": 0.6240506329113924, + "self_bleu": 0.00720560915676511 + }, + "0.5": { + "mean_cer": 0.8361858372849987, + "diversity": 0.7997218378718688, + "sent_unique": 1.0, + "distinct2": 0.6060126582278481, + "self_bleu": 0.0065689824841105166 + }, + "1.0": { + "mean_cer": 0.8390361847911715, + "diversity": 0.7978319711295725, + "sent_unique": 1.0, + "distinct2": 0.6009493670886076, + "self_bleu": 0.005285424829462745 + }, + "1.5": { + "mean_cer": 0.8457771777829102, + "diversity": 0.8134699633307632, + "sent_unique": 1.0, + "distinct2": 0.6306962025316456, + "self_bleu": 0.0037562758701191663 + }, + "2.0": { + "mean_cer": 0.8530737908495466, + "diversity": 0.828318481566094, + "sent_unique": 1.0, + "distinct2": 0.6604430379746835, + "self_bleu": 0.003806074842495409 + }, + "3.0": { + "mean_cer": 0.8772574230238586, + "diversity": 0.829961794478179, + "sent_unique": 1.0, + "distinct2": 0.6686708860759494, + "self_bleu": 0.008747297119591432 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_classifier.pt b/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_classifier.pt new file mode 100644 index 0000000000000000000000000000000000000000..d969ea078314d0c1720f088b83f21a2958ede1a2 --- /dev/null +++ 
b/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_classifier.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4053d24514f08c475662f69a5b01d1577cd1f79837df69ac2175705310e9a23 +size 561505 diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_data.npz b/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_data.npz new file mode 100644 index 0000000000000000000000000000000000000000..29ea0ca227ef29ef8c2990ea7052aafd4a32d352 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_data.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:840494704113872c8e53e3627e5666c8af940ed683515ec37894dd3091a14684 +size 164512 diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_diversity_tradeoff.png b/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_diversity_tradeoff.png new file mode 100644 index 0000000000000000000000000000000000000000..8de05cc37c892fb302e7f689dbb95bce46343fd9 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T16/task5_quality_diversity_tradeoff.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T16/task5_report.txt b/analysis_outputs/outputs_all_models_20260325/T16/task5_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..3afabe9dbc4932c05ad098afb13d5da4c3b7851a --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T16/task5_report.txt @@ -0,0 +1,15 @@ +TASK 5 — CLASSIFIER-FREE GUIDANCE +================================================== + +Classifier params: 139521 +Training samples : 40 + +Guidance scale sweep: + λ CER diversity d2 sBLEU + ---------------------------------------------------- + 0.0 0.8336 0.808 0.624 0.007 ← optimal + 0.5 0.8362 0.800 0.606 0.007 + 1.0 0.8390 0.798 0.601 0.005 + 1.5 0.8458 0.813 0.631 0.004 + 2.0 0.8531 0.828 0.660 0.004 + 3.0 0.8773 0.830 0.669 0.009 diff --git 
a/analysis_outputs/outputs_all_models_20260325/T32/task1_encoder_cost.png b/analysis_outputs/outputs_all_models_20260325/T32/task1_encoder_cost.png new file mode 100644 index 0000000000000000000000000000000000000000..beabe21b52785f2346cdb6796708dadbdb3dd3c1 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task1_encoder_cost.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task1_kv_cache.txt b/analysis_outputs/outputs_all_models_20260325/T32/task1_kv_cache.txt new file mode 100644 index 0000000000000000000000000000000000000000..510621ce6388c9466dfbfa6271469b125a9fc475 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task1_kv_cache.txt @@ -0,0 +1,15 @@ +TASK 1 — KV CACHE BENCHMARK +======================================== + +has_generate_cached=True +memory_profile=Torch CPU mem-event reduction: 31.1% @ src_len=64 (std=4287.2MB, cache=2953.9MB) + + src_len standard(s) cached(s) speedup encoder% + 16 1.914 1.165 1.64x 39.6% + 32 1.542 0.891 1.73x 42.1% + 64 2.096 1.475 1.42x 42.7% + +Saved graphs: + - task1_time_comparison.png + - task1_speedup.png + - task1_encoder_cost.png diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task1_speedup.png b/analysis_outputs/outputs_all_models_20260325/T32/task1_speedup.png new file mode 100644 index 0000000000000000000000000000000000000000..d701f46f693d47959b90cbf538801fae6de6a2d0 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task1_speedup.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task1_time_comparison.png b/analysis_outputs/outputs_all_models_20260325/T32/task1_time_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..6e1e14f82e7bebed60da2e0ac2d20b710d7f4676 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task1_time_comparison.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task2_all_layers_t0.png 
b/analysis_outputs/outputs_all_models_20260325/T32/task2_all_layers_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..05b2cb3a226f03d905e11a7f1c951a598efa6094 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task2_all_layers_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_evolution.png b/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_evolution.png new file mode 100644 index 0000000000000000000000000000000000000000..dc3aa084336565121cac3cf27617c6da4cf497a5 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_evolution.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_t0.png b/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..e2dff1aa8f4a77523c60b61e77297f78ca6c5c31 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_t31.png b/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_t31.png new file mode 100644 index 0000000000000000000000000000000000000000..5c3ea77462aaad25a15ca06a26d14d5301fc41bc Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task2_attn_t31.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task2_report.txt b/analysis_outputs/outputs_all_models_20260325/T32/task2_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..94070c289d3318452528896041a50de74aceb455 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task2_report.txt @@ -0,0 +1,35 @@ +TASK 2 — ATTENTION + DRIFT REPORT +================================================== + +Input : dharmo rakṣati rakṣitaḥ +Output: धर्मो रक्षति रक्षितः + +Captured steps: 32 +Analysis quality: WEAK +Final output uniq-ratio: 1.000 
+Degenerate output: NO +Multi-sample semantic score (n<=8): 0.0627 +Lock-in step (CER<=0.05): t=0 +Locked tokens: 75 Flexible tokens: 5 +TF-IDF vs attention stability corr: -0.0869 +TF-IDF status: WEAK + +Saved graphs: + - task2_attn_t*.png / task2_all_layers_t0.png + - task2_attn_evolution.png + - task2_semantic_drift.png + - task2_source_alignment.png + - task2_tfidf_vs_attention.png + +Step trajectory (first 10 rows) +------------------------------------------------------------ +t= 31 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 30 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 29 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 28 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 27 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 26 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 25 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 24 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 23 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ +t= 22 bert=0.0167 drift=0.9833 text=तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ तृ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task2_semantic_drift.png b/analysis_outputs/outputs_all_models_20260325/T32/task2_semantic_drift.png new file mode 100644 index 0000000000000000000000000000000000000000..05035b7f6b1d92d7662a54de768552b21782e301 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task2_semantic_drift.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task2_source_alignment.png 
b/analysis_outputs/outputs_all_models_20260325/T32/task2_source_alignment.png new file mode 100644 index 0000000000000000000000000000000000000000..2f88e5b0b525aa9c5f8b49d8d76eb34c8f352cfa Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task2_source_alignment.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task2_tfidf_vs_attention.png b/analysis_outputs/outputs_all_models_20260325/T32/task2_tfidf_vs_attention.png new file mode 100644 index 0000000000000000000000000000000000000000..325cde50a62791eb80fc6823d39f86ba6586dc10 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task2_tfidf_vs_attention.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task3_concept_space.png b/analysis_outputs/outputs_all_models_20260325/T32/task3_concept_space.png new file mode 100644 index 0000000000000000000000000000000000000000..d8de1f9e0f762b79c9b1c2035834c607aee341fc Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task3_concept_space.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task3_diversity_curve.png b/analysis_outputs/outputs_all_models_20260325/T32/task3_diversity_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..e39fa82a7a6ba379230307c75494d3ae0bf07bb1 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task3_diversity_curve.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task3_diversity_direction.npy b/analysis_outputs/outputs_all_models_20260325/T32/task3_diversity_direction.npy new file mode 100644 index 0000000000000000000000000000000000000000..40c4da2d569267441ed2dae1c58e0958acb38bc0 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task3_diversity_direction.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e547306c9469858deaa9985c30d31639c8a9f8104e8addd83afa88fa0264831 +size 4224 diff 
--git a/analysis_outputs/outputs_all_models_20260325/T32/task3_pca_explained_variance.png b/analysis_outputs/outputs_all_models_20260325/T32/task3_pca_explained_variance.png new file mode 100644 index 0000000000000000000000000000000000000000..e15af89b198faabbc354a7f7549d40307280e13a Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task3_pca_explained_variance.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task3_report.txt b/analysis_outputs/outputs_all_models_20260325/T32/task3_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..354cb39ea907166b658d1d33682817fd2bf13127 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task3_report.txt @@ -0,0 +1,21 @@ +TASK 3 — CONCEPT VECTORS + PCA STEERING +================================================== + +PCA: 50 components, 94.6% variance +Diversity PC: 0 (|r|=-0.530 with diversity proxy) + +Direction validity: WEAK +Spectrum unique ratio (mean over 5 seeds): 0.840 +Spectrum semantic stability (mean over 5 seeds): 0.234 + +Saved graphs: + - task3_concept_space.png + - task3_pca_explained_variance.png + - task3_diversity_curve.png + +Diversity spectrum: + alpha=-2.0 → ेन श्रे श्रे ेन श्रे अण्ड व्याः श्रे तन्त्रा ॥ ॥ ॥ व्याः व्याः व्याः तद्वद् तद्वद् तद्वद् ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ तद्वद् ॥ ॥ ॥ ॥ ॥ व्याः व्याः व्याः ॥ ॥ राजन्य व्याः व्याः व्याः ॥ व्याः व्याः ॥ ॥ काम्य ॥ ॥ ॥ व्याः ॥ तद्वद् ॥ ॥ ॥ ॥ ॥ तन्त्रा तन्त्रा ॥ ॥ ॥ ॥ व्याः ॥ ॥ ॥ ॥ ॥ युधम् तद्वद् युधम् ॥ + alpha=-1.0 → श्रे श्रे श्रे ेन श्रे श्रे श्रे श्रे अण्ड तन्त्रा व्याः ॥ अण्ड अण्ड तन्त्रा व्याः तद्वद् ॥ व्याः ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ अण्ड ॥ ॥ ॥ व्याः ॥ व्याः नो̍ ॥ ॥ ॥ ॥ ॥ व्याः व्याः अण्ड ॥ ॥ तन्त्रा ॥ ॥ तद्वद् युधम् रोमा शम्भु ॥ धूमं तन्त्रा ॥ तन्त्रा ॥ व्याः ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ ॥ + alpha=+0.0 → अण्ड श्रे करः श्रे तन्त्रा करः करः तन्त्रा श्रे अण्ड अण्ड अण्ड ॥ श्रे तद्वद् अण्ड ॥ ॥ अण्ड ॥ ॥ ॥ ॥ ॥ ॥ ॥ अण्ड ॥ ॥ ॥ ॥ अण्ड ॥ ॥ ॥ ॥ ॥ ॥ राजन्य तन्त्रा नो̍ ॥ ॥ 
॥ ॥ ॥ व्याः ॥ अण्ड ॥ काम्य ॥ ॥ ॥ ॥ ॥ शम्भु धूमं तन्त्रा तन्त्रा ेन ॥ काम्य ॥ ॥ करः तन्त्रा ॥ अण्ड ॥ अण्ड ॥ विनिर्जित्य ॥ ॥ ॥ तन्त्रा अण्ड तद्वद् करः + alpha=+1.0 → माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण + alpha=+2.0 → माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण माण diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task4_3d.png b/analysis_outputs/outputs_all_models_20260325/T32/task4_3d.png new file mode 100644 index 0000000000000000000000000000000000000000..a340d93759fc43c8c771d56ba3b8774acd109732 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T32/task4_3d.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task4_raw_results.json b/analysis_outputs/outputs_all_models_20260325/T32/task4_raw_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c533aafec897e5c8588e597e61c869840db75086 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task4_raw_results.json @@ -0,0 +1,8 @@ +{ + "32": { + "bertscore_f1": 0.04221478336089375, + "semantic_sim": 0.0011696306429548563, + "bleu": 3.0458312005937454e-233, + "speed_per_sample": 1.8451481468771818 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task4_report.txt b/analysis_outputs/outputs_all_models_20260325/T32/task4_report.txt new file mode 100644 index 
0000000000000000000000000000000000000000..2d513a1903d5e1abba5be3011675ff46f9a35638 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task4_report.txt @@ -0,0 +1,14 @@ +TASK 4 — SEMANTIC ROBUSTNESS ABLATION +================================================== + +Optimal diffusion steps = 32 + + T BERT-F1 SEM_SIM BLEU sec/sample + -------------------------------------------------------- + 32 0.0422 0.0012 0.0000 1.8451 + +Marginal gains (BERT-F1): + +Saved plots/files: + - task4_3d.png + - task4_raw_results.json diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task5_guidance_results.json b/analysis_outputs/outputs_all_models_20260325/T32/task5_guidance_results.json new file mode 100644 index 0000000000000000000000000000000000000000..febeb98750b1774aa4accb0efae9dcc08e7a905c --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task5_guidance_results.json @@ -0,0 +1,44 @@ +{ + "0.0": { + "mean_cer": 0.9357151936131439, + "diversity": 0.2390096905266583, + "sent_unique": 1.0, + "distinct2": 0.010759493670886076, + "self_bleu": 0.5327401126175695 + }, + "0.5": { + "mean_cer": 0.9372355582889396, + "diversity": 0.251484832665227, + "sent_unique": 0.95, + "distinct2": 0.014556962025316455, + "self_bleu": 0.5115872966948625 + }, + "1.0": { + "mean_cer": 0.9467289259059477, + "diversity": 0.16373703579151694, + "sent_unique": 0.7, + "distinct2": 0.017721518987341773, + "self_bleu": 0.6902474474043079 + }, + "1.5": { + "mean_cer": 0.9528026153008634, + "diversity": 0.13674192439776467, + "sent_unique": 0.55, + "distinct2": 0.016772151898734176, + "self_bleu": 0.7432883031032048 + }, + "2.0": { + "mean_cer": 0.952512615537643, + "diversity": 0.14404646175677124, + "sent_unique": 0.5, + "distinct2": 0.013291139240506329, + "self_bleu": 0.7251982157269639 + }, + "3.0": { + "mean_cer": 0.9495546075910856, + "diversity": 0.1806440794427056, + "sent_unique": 0.5, + "distinct2": 0.017721518987341773, + "self_bleu": 0.6564333601019305 + } 
+} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_classifier.pt b/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_classifier.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f9b2f1604a6ec778fcb0a2dcd6e9eef5099058a --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_classifier.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d7719ed8edad6b67d3c3ee1702a75f622889559a945f995d96a2b5c1b5eb6e7 +size 561505 diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_data.npz b/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_data.npz new file mode 100644 index 0000000000000000000000000000000000000000..597880910dc0c2bb2c1abd232cf70996c68af407 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_data.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6d269943c83d82deedce925f6298004b2edc3505adb631142379b8c62b467ff +size 164512 diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_diversity_tradeoff.png b/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_diversity_tradeoff.png new file mode 100644 index 0000000000000000000000000000000000000000..94ea577696bd1d8459be763ea7ede8e5c5afa97c --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task5_quality_diversity_tradeoff.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c02df97ebf86286c5f09aea37a7f468985bd656620313e3cb3efbff0d480ce16 +size 104732 diff --git a/analysis_outputs/outputs_all_models_20260325/T32/task5_report.txt b/analysis_outputs/outputs_all_models_20260325/T32/task5_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee947137db244c0fcb47bf64ec9a16431efbb77d --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T32/task5_report.txt @@ -0,0 +1,15 @@ +TASK 5 — 
CLASSIFIER-FREE GUIDANCE +================================================== + +Classifier params: 139521 +Training samples : 40 + +Guidance scale sweep: + λ CER diversity d2 sBLEU + ---------------------------------------------------- + 0.0 0.9357 0.239 0.011 0.533 ← optimal + 0.5 0.9372 0.251 0.015 0.512 + 1.0 0.9467 0.164 0.018 0.690 + 1.5 0.9528 0.137 0.017 0.743 + 2.0 0.9525 0.144 0.013 0.725 + 3.0 0.9496 0.181 0.018 0.656 diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task1_encoder_cost.png b/analysis_outputs/outputs_all_models_20260325/T4/task1_encoder_cost.png new file mode 100644 index 0000000000000000000000000000000000000000..16e8afb18a5bd226bd85ca3e2563ddfd56f0faf8 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task1_encoder_cost.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task1_kv_cache.txt b/analysis_outputs/outputs_all_models_20260325/T4/task1_kv_cache.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7067efc1d3d088f712f3d80d1feee438250c14b --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task1_kv_cache.txt @@ -0,0 +1,15 @@ +TASK 1 — KV CACHE BENCHMARK +======================================== + +has_generate_cached=True +memory_profile=Torch CPU mem-event reduction: 24.6% @ src_len=64 (std=525.8MB, cache=396.4MB) + + src_len standard(s) cached(s) speedup encoder% + 16 0.267 0.173 1.54x 43.2% + 32 0.197 0.153 1.29x 40.7% + 64 0.353 0.265 1.33x 42.0% + +Saved graphs: + - task1_time_comparison.png + - task1_speedup.png + - task1_encoder_cost.png diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task1_speedup.png b/analysis_outputs/outputs_all_models_20260325/T4/task1_speedup.png new file mode 100644 index 0000000000000000000000000000000000000000..40cfd1db848f00d701958637ce70408adb24e0cb Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task1_speedup.png differ diff --git 
a/analysis_outputs/outputs_all_models_20260325/T4/task1_time_comparison.png b/analysis_outputs/outputs_all_models_20260325/T4/task1_time_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..5b239ae4c466365ec09122fd1c120db97b3c9b0d Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task1_time_comparison.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task2_all_layers_t0.png b/analysis_outputs/outputs_all_models_20260325/T4/task2_all_layers_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..93846e65493ce5d8e81a7ce0043017380cbb5516 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task2_all_layers_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_evolution.png b/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_evolution.png new file mode 100644 index 0000000000000000000000000000000000000000..a0ad60883dd1ca72c6e454f7cd9ea369f83379ed Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_evolution.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_t0.png b/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..8625e508c4328ee86916d0c0ce8590de5b28473f Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_t3.png b/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_t3.png new file mode 100644 index 0000000000000000000000000000000000000000..b7f548a696b3f668a7071436589f3f733029b7ba Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task2_attn_t3.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task2_report.txt b/analysis_outputs/outputs_all_models_20260325/T4/task2_report.txt new 
file mode 100644 index 0000000000000000000000000000000000000000..ada36ec182c7758009b11f3e9690dc5ae32d0a34 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task2_report.txt @@ -0,0 +1,29 @@ +TASK 2 — ATTENTION + DRIFT REPORT +================================================== + +Input : dharmo rakṣati rakṣitaḥ +Output: धर्मो रक्षति रक्षितः + +Captured steps: 4 +Analysis quality: VALID +Final output uniq-ratio: 1.000 +Degenerate output: NO +Multi-sample semantic score (n<=8): 0.1568 +Lock-in step (CER<=0.05): t=0 +Locked tokens: 79 Flexible tokens: 1 +TF-IDF vs attention stability corr: 0.9472 +TF-IDF status: OK + +Saved graphs: + - task2_attn_t*.png / task2_all_layers_t0.png + - task2_attn_evolution.png + - task2_semantic_drift.png + - task2_source_alignment.png + - task2_tfidf_vs_attention.png + +Step trajectory (first 10 rows) +------------------------------------------------------------ +t= 3 bert=0.0603 drift=0.9397 text=ति ति ति रक्षि तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर्मो +t= 2 bert=0.0597 drift=0.9403 text=ति ति ति रक्षि तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर्मो +t= 1 bert=0.0597 drift=0.9403 text=ति ति ति रक्षि तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर्मो +t= 0 bert=0.0597 drift=0.9403 text=ति ति ति रक्षि तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर्मो diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task2_semantic_drift.png b/analysis_outputs/outputs_all_models_20260325/T4/task2_semantic_drift.png new file mode 100644 index 0000000000000000000000000000000000000000..176ce324d15eac456e83ebbdc120c8e49e9592ad Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task2_semantic_drift.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task2_source_alignment.png b/analysis_outputs/outputs_all_models_20260325/T4/task2_source_alignment.png new file mode 100644 index 0000000000000000000000000000000000000000..ea12443a4fa9e87473f94b4d99fed1098ee82749 Binary files /dev/null and 
b/analysis_outputs/outputs_all_models_20260325/T4/task2_source_alignment.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task2_tfidf_vs_attention.png b/analysis_outputs/outputs_all_models_20260325/T4/task2_tfidf_vs_attention.png new file mode 100644 index 0000000000000000000000000000000000000000..4fcd9fbd7871d9730538ac3bf8c02eaafa28974b Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task2_tfidf_vs_attention.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task3_concept_space.png b/analysis_outputs/outputs_all_models_20260325/T4/task3_concept_space.png new file mode 100644 index 0000000000000000000000000000000000000000..92d38005d3d520cd8815d795a86eca59d4af91df Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task3_concept_space.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task3_diversity_curve.png b/analysis_outputs/outputs_all_models_20260325/T4/task3_diversity_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..305ffd81ca67403818f32068c024e56d39998d38 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task3_diversity_curve.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task3_diversity_direction.npy b/analysis_outputs/outputs_all_models_20260325/T4/task3_diversity_direction.npy new file mode 100644 index 0000000000000000000000000000000000000000..c3b77f364d874d38c068f3d14c4f629d53844723 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task3_diversity_direction.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8295c0ab0d908a0069932ddfb8adcc398aa51e92f43be64447e9784fb44d332 +size 4224 diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task3_pca_explained_variance.png b/analysis_outputs/outputs_all_models_20260325/T4/task3_pca_explained_variance.png new file mode 100644 index 
0000000000000000000000000000000000000000..63cf3b5e6cf9e60d506d5675abb05cfe2800e990 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task3_pca_explained_variance.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task3_report.txt b/analysis_outputs/outputs_all_models_20260325/T4/task3_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dd35ab36752cf49201756f7980823b775e8b424 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task3_report.txt @@ -0,0 +1,21 @@ +TASK 3 — CONCEPT VECTORS + PCA STEERING +================================================== + +PCA: 50 components, 72.0% variance +Diversity PC: 0 (|r|=-0.349 with diversity proxy) + +Direction validity: WEAK +Spectrum unique ratio (mean over 5 seeds): 1.000 +Spectrum semantic stability (mean over 5 seeds): 0.325 + +Saved graphs: + - task3_concept_space.png + - task3_pca_explained_variance.png + - task3_diversity_curve.png + +Diversity spectrum: + alpha=-2.0 → बले र् अपश्य येहि ऌ वीर्य ऌ सिंहा सन सन ̍त̱ ज्ज्वा माम् वै वै महर्द्धि महर्द्धि ऌ सिंहा कू दिक्षु ऌ दश्य वै क्रमं बले र् दश्य स्वस्थ तुल तुल वीर्य वीर्य वी ऌ सिंहा राज कू वीर्य वीर्य वीर्य वीर्य ऌ वी निरुद्धा ̍त̱ बले बले साध्व उपशान्त वी वी दाक्षि हतः महर्द्धि साध्व तु वी वी ऌ दिक्षु दिक्षु पूष माम् पुरं ऌ दिक्षु वी पूष ̍त̱ ोद् दिक्षु पुरं स्त्रं मनोरथ अस्मा ऌ वाहि राजान वी + alpha=-1.0 → बले बले अ तुल तुल वीर्य स्य सिंहा सन सन गतस्य गतस्य वै वै वै गतस्य सन पाता सिंहा दिता । ज्ज्वा वै वै बले बले र् अ अ तुल तुल वीर्य वीर्य स्य सिंहा सिंहा ध्रा स्य वीर्य वीर्य वीर्य तुल तुल ̍त̱ अ र् र् बले दिक्षु वै वै । वै संस्थिता रतं सन गतस्य पूष । वक्त्र सन सन सन सन सन व गतस्य व सन ॥ ति मनो हतः मातु ̍त̱ व कू कू सन सन + alpha=+0.0 → बले र् अ तुल तुल वीर्य स्य सिंहा सन सन गतस्य गतस्य वै वै वै गतस्य सन सन सिंहा सिंहा । व वै वै त्वम् बले र् अ अ तुल तुल वीर्य वीर्य स्य स्य स्य सिंहा स्य स्य वीर्य वीर्य तुल तुल तुल अ र् र् र् त्ते वै वै गतस्य सन सन सन सन सन गतस्य निःसृ गतस्य सन 
गतस्य सन सन सन सन सन वि सन वि स्रव सिंहा सन सन सन सन सन सन सन गतस्य + alpha=+1.0 → बले र् अ अ तुल वीर्य स्य सिंहा सन सन गतस्य गतस्य वै वै वै गतस्य सन सन सिंहा सिंहा षण् स्य ै वै बले बले र् अ अ तुल कान्ते षण् वीर्य स्य स्य सिंहा स्य स्य स्य वीर्य षण् वीर्य षण् अ अ र् र् र् षण् ेष गतस्य गतस्य गतस्य गतस्य सन सन षण् षण् गतस्य सन गतस्य गतस्य सन सन सन सन ष्णु गतस्य नो षण् नो - सन सन सन सन सन सन सन सन + alpha=+2.0 → षण् र् अ षण् षण् षण् स्य षण् षण् षण् षण् षण् वै षण् षण् गतस्य षण् षण् षण् षण् षण् षण् षण् स षण् षण् र् षण् अ षण् षण् षण् षण् स्य स्य षण् स्य स्य स्य षण् षण् षण् षण् षण् अ र् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् गतस्य षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् षण् diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task4_3d.png b/analysis_outputs/outputs_all_models_20260325/T4/task4_3d.png new file mode 100644 index 0000000000000000000000000000000000000000..c14e81d028f566e0860be4acc58fa49260a9e914 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task4_3d.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task4_raw_results.json b/analysis_outputs/outputs_all_models_20260325/T4/task4_raw_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8fe721106edb92be1e77debbaac2b09ff6c31dac --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task4_raw_results.json @@ -0,0 +1,8 @@ +{ + "4": { + "bertscore_f1": 0.2643599230400282, + "semantic_sim": 0.05738751729553948, + "bleu": 4.535508724717073e-80, + "speed_per_sample": 0.27822498957684727 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task4_report.txt b/analysis_outputs/outputs_all_models_20260325/T4/task4_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..e02dc24ad737cd05321f86034a3c24bbce49e4ef --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task4_report.txt @@ -0,0 +1,14 @@ 
+TASK 4 — SEMANTIC ROBUSTNESS ABLATION +================================================== + +Optimal diffusion steps = 4 + + T BERT-F1 SEM_SIM BLEU sec/sample + -------------------------------------------------------- + 4 0.2644 0.0574 0.0000 0.2782 + +Marginal gains (BERT-F1): + +Saved plots/files: + - task4_3d.png + - task4_raw_results.json diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task5_guidance_results.json b/analysis_outputs/outputs_all_models_20260325/T4/task5_guidance_results.json new file mode 100644 index 0000000000000000000000000000000000000000..dabf71e3315c81165e1eef27b7e017775c379237 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task5_guidance_results.json @@ -0,0 +1,44 @@ +{ + "0.0": { + "mean_cer": 0.8366061504793759, + "diversity": 0.8152941244227976, + "sent_unique": 1.0, + "distinct2": 0.6351265822784811, + "self_bleu": 0.0045383334328860155 + }, + "0.5": { + "mean_cer": 0.8356280133439636, + "diversity": 0.7972091041380793, + "sent_unique": 1.0, + "distinct2": 0.5987341772151898, + "self_bleu": 0.004315968939031317 + }, + "1.0": { + "mean_cer": 0.8368554238287981, + "diversity": 0.7911742292217158, + "sent_unique": 1.0, + "distinct2": 0.5882911392405064, + "self_bleu": 0.005942680797074879 + }, + "1.5": { + "mean_cer": 0.8366626550229774, + "diversity": 0.7826869545397406, + "sent_unique": 1.0, + "distinct2": 0.5708860759493671, + "self_bleu": 0.005512166869886022 + }, + "2.0": { + "mean_cer": 0.836666396481504, + "diversity": 0.7739285035209875, + "sent_unique": 1.0, + "distinct2": 0.5531645569620253, + "self_bleu": 0.005307549920050191 + }, + "3.0": { + "mean_cer": 0.8362896513976861, + "diversity": 0.7685362591015914, + "sent_unique": 1.0, + "distinct2": 0.5424050632911392, + "self_bleu": 0.005332545087956551 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_classifier.pt 
b/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_classifier.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbe876fd06482d83b060a851cb7d597543277b9d --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_classifier.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c2c1e90bc5c9611bbedf00b8e3380cfc32e1f09c168fc591e866aca155bb961 +size 561505 diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_data.npz b/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_data.npz new file mode 100644 index 0000000000000000000000000000000000000000..512fa8d2e3283455520ee0a70d62a8224d690206 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_data.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a926559d00760bf640e998791216a6bb7f40f7dd4be70545057435f5f657765e +size 164512 diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_diversity_tradeoff.png b/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_diversity_tradeoff.png new file mode 100644 index 0000000000000000000000000000000000000000..a343fef19b7a4ccee01b138d963b755cd90005a2 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T4/task5_quality_diversity_tradeoff.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T4/task5_report.txt b/analysis_outputs/outputs_all_models_20260325/T4/task5_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8cb8f40f764d4df5677bdc393d23c1863c89e88 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T4/task5_report.txt @@ -0,0 +1,15 @@ +TASK 5 — CLASSIFIER-FREE GUIDANCE +================================================== + +Classifier params: 139521 +Training samples : 40 + +Guidance scale sweep: + λ CER diversity d2 sBLEU + ---------------------------------------------------- + 0.0 0.8366 0.815 0.635 0.005 + 0.5 
0.8356 0.797 0.599 0.004 ← optimal + 1.0 0.8369 0.791 0.588 0.006 + 1.5 0.8367 0.783 0.571 0.006 + 2.0 0.8367 0.774 0.553 0.005 + 3.0 0.8363 0.769 0.542 0.005 diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task1_encoder_cost.png b/analysis_outputs/outputs_all_models_20260325/T64/task1_encoder_cost.png new file mode 100644 index 0000000000000000000000000000000000000000..4e5df1cced8a28015561b1d83b190e893d355172 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task1_encoder_cost.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task1_kv_cache.txt b/analysis_outputs/outputs_all_models_20260325/T64/task1_kv_cache.txt new file mode 100644 index 0000000000000000000000000000000000000000..547d6976e52f48cbfe69ec8088bd6497acb90391 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task1_kv_cache.txt @@ -0,0 +1,15 @@ +TASK 1 — KV CACHE BENCHMARK +======================================== + +has_generate_cached=True +memory_profile=Torch CPU mem-event reduction: 31.4% @ src_len=64 (std=8592.3MB, cache=5890.5MB) + + src_len standard(s) cached(s) speedup encoder% + 16 4.206 3.584 1.17x 74.4% + 32 4.647 3.371 1.38x 37.6% + 64 8.403 4.593 1.83x 49.6% + +Saved graphs: + - task1_time_comparison.png + - task1_speedup.png + - task1_encoder_cost.png diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task1_speedup.png b/analysis_outputs/outputs_all_models_20260325/T64/task1_speedup.png new file mode 100644 index 0000000000000000000000000000000000000000..f6c4419ab7cd1f9e1835cbdf0e5501f8b81b77fe Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task1_speedup.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task1_time_comparison.png b/analysis_outputs/outputs_all_models_20260325/T64/task1_time_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..38a666f4ee9c5a79ce50efe842227f93cf1159d6 Binary files /dev/null and 
b/analysis_outputs/outputs_all_models_20260325/T64/task1_time_comparison.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task2_all_layers_t0.png b/analysis_outputs/outputs_all_models_20260325/T64/task2_all_layers_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..aec8afac6f920cc3c080eb92b38c1b91cf0a050b Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task2_all_layers_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_evolution.png b/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_evolution.png new file mode 100644 index 0000000000000000000000000000000000000000..1b27a8e2e2917c1a5282a0b30712d3a27492109d Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_evolution.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_t0.png b/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..c260b1cad10fc59684af22af29490c5b9ae0d9ea Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_t63.png b/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_t63.png new file mode 100644 index 0000000000000000000000000000000000000000..504bc5fbd6577ba196d5092cd08d28153486c7d7 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task2_attn_t63.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task2_report.txt b/analysis_outputs/outputs_all_models_20260325/T64/task2_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0c71354dce847a284b3a414c2391a59feefb03e --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task2_report.txt @@ -0,0 +1,35 @@ +TASK 2 — ATTENTION + DRIFT REPORT 
+================================================== + +Input : dharmo rakṣati rakṣitaḥ +Output: धर्मो रक्षति रक्षितः + +Captured steps: 64 +Analysis quality: VALID +Final output uniq-ratio: 1.000 +Degenerate output: NO +Multi-sample semantic score (n<=8): 0.1490 +Lock-in step (CER<=0.05): t=0 +Locked tokens: 59 Flexible tokens: 21 +TF-IDF vs attention stability corr: 0.7804 +TF-IDF status: OK + +Saved graphs: + - task2_attn_t*.png / task2_all_layers_t0.png + - task2_attn_evolution.png + - task2_semantic_drift.png + - task2_source_alignment.png + - task2_tfidf_vs_attention.png + +Step trajectory (first 10 rows) +------------------------------------------------------------ +t= 63 bert=0.0552 drift=0.9448 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 62 bert=0.0548 drift=0.9452 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 61 bert=0.0548 drift=0.9452 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 60 bert=0.0548 drift=0.9452 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 59 bert=0.0548 drift=0.9452 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 58 bert=0.0548 drift=0.9452 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 57 bert=0.0548 drift=0.9452 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 56 bert=0.0546 drift=0.9454 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 55 bert=0.0546 drift=0.9454 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् +t= 54 bert=0.0546 drift=0.9454 text=धर्मो ति काम्य तः तः तः तः तः तः धर्मो धर्मो धर्मो धर्मो धर् diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task2_semantic_drift.png b/analysis_outputs/outputs_all_models_20260325/T64/task2_semantic_drift.png new file mode 100644 index 0000000000000000000000000000000000000000..544662a38781ab84d982eac2784204a8be3c27cb Binary files /dev/null and 
b/analysis_outputs/outputs_all_models_20260325/T64/task2_semantic_drift.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task2_source_alignment.png b/analysis_outputs/outputs_all_models_20260325/T64/task2_source_alignment.png new file mode 100644 index 0000000000000000000000000000000000000000..fc5296c294f8013e9fac2e4e62d69712202f85ab Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task2_source_alignment.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task2_tfidf_vs_attention.png b/analysis_outputs/outputs_all_models_20260325/T64/task2_tfidf_vs_attention.png new file mode 100644 index 0000000000000000000000000000000000000000..76b1daba96d7e70644be1f89a46eeac91bb08953 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task2_tfidf_vs_attention.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task3_concept_space.png b/analysis_outputs/outputs_all_models_20260325/T64/task3_concept_space.png new file mode 100644 index 0000000000000000000000000000000000000000..a4718b3a43d7392f965d931b867dabab083b8cec Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task3_concept_space.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task3_diversity_curve.png b/analysis_outputs/outputs_all_models_20260325/T64/task3_diversity_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..ab08a6b2c1e5a50ef7bb8664d23d6a6e6a5f3ab1 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task3_diversity_curve.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task3_diversity_direction.npy b/analysis_outputs/outputs_all_models_20260325/T64/task3_diversity_direction.npy new file mode 100644 index 0000000000000000000000000000000000000000..c07cc6f708e7d0bde86c63cda93c1fd4cae6c456 --- /dev/null +++ 
b/analysis_outputs/outputs_all_models_20260325/T64/task3_diversity_direction.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:995c77610a2fdc3c333925927723d93a2b0802e1d127fcb3562ca1ac1273b4c9 +size 4224 diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task3_pca_explained_variance.png b/analysis_outputs/outputs_all_models_20260325/T64/task3_pca_explained_variance.png new file mode 100644 index 0000000000000000000000000000000000000000..6bd11639db65b2b031650bc1ef5d9469a434289b Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task3_pca_explained_variance.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task3_report.txt b/analysis_outputs/outputs_all_models_20260325/T64/task3_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1ab682bcebc6ee2b81c25dc0c1ca69510d9865a --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task3_report.txt @@ -0,0 +1,21 @@ +TASK 3 — CONCEPT VECTORS + PCA STEERING +================================================== + +PCA: 39 components, 100.0% variance +Diversity PC: 0 (|r|=0.314 with diversity proxy) + +Direction validity: WEAK +Spectrum unique ratio (mean over 5 seeds): 1.000 +Spectrum semantic stability (mean over 5 seeds): 0.302 + +Saved graphs: + - task3_concept_space.png + - task3_pca_explained_variance.png + - task3_diversity_curve.png + +Diversity spectrum: + alpha=-2.0 → बले बले अ तुल तुल वीर्य अ̱स्या भूयः सन ान्ते लब्ध्वा अर्थ वै भूयः त्ति ान्त सन भूयः भूयः तुल अ अ वीरो बले बले बले अ̱स्या भूयः ॥ ॥ अ अ अ̱स्या ॥ वे̱ध सन सन सन सन ह् ह् ान ह् स्य ह् ानु ह् यो बो दादि ह् मतां ह् सान्त्व ह् ( मतां ॥ धीमान् भूयः ॥ अ̱स्या पान होमयेत् सारथि ( ॥ भूयः ॥ ॥ ॥ ॥ ॥ ॥ गोप लज्ज अ̱स्या मतां लज्ज यो + alpha=-1.0 → बले बले अ तुल तुल वीर्य स्य सिंहा सिंहा सन त्ति वै वै वै द् सन सिंहा स्य स्य तुल तुल अ प्रयाति र् बले बले बले ॥ ॥ ॥ अ ॥ पि महा सन सन सन सन सन सन सिंहा स्रव सन स्य मीं गोप स्य स्य स्य स्य भूयः तुल सान्त्व 
यो अ ह् ान ान तव वेग ( यो भूषणम् ( ानु ॥ ॥ ॥ ॥ ॥ ॥ अ̱स्या ॥ ॥ ॥ ॥ यो पि ॥ म + alpha=+0.0 → बले र् अ तुल तुल वीर्य स्य सिंहा सिंहा सन ध्या वै वै वै गतस्य भ सिंहा भ स्य तुल तुल अ अ र् बले बले बले ॥ बले र् । । ॥ वै वै सन सन सन सन सिंहा सिंहा सिंहा स्य स्य स्य स्य भ स्य स्य स्य स्य ानु स्य ् ता यो स्य फल ॥ म तुल च सि ॥ ् ॥ ॥ न् ॥ ॥ ॥ ॥ ॥ ॥ ॥ महा सन सन क्ष ॥ + alpha=+1.0 → बले र् । तुल यु वीर्य स्य सिंहा सिंहा सन ध्या वै वै वै । सन सिंहा स्य स्य तुल तुल । र् र् बले बले बले ॥ ॥ र् अ स् सन ते सन ीं सन सन त्र सिंहा यु सिंहा स्थल स्य स्य रौद्र स्य स्य न्दा ता यु स्य यु त्र क्ष ।। ीं स्य म्र कल्प यत् स् क्ष क्ष ॥ स्य यु मण्डलं यु ॥ ॥ ीं ॥ ॥ भ्यः ीं ीं ॥ ॥ ॥ + alpha=+2.0 → र् र् तुरङ्ग आहुः ितो । स्य सिंहा सिंहा सिंहा ब्रह्मा वै वै & ते तस् तुरङ्ग नो स्तम्भ ीं यु संच र् र् बले ीं स्तम्भ ते । तस् न्तं मण्डलं यु । स्तम्भ स्तम्भ सन आहुः सिंहा यु सिंहा सिंहा स्य मण्डलं यु स्य स्य स्य एव कल्प स्तम्भ ̱र स्तम्भ अमु आहुः यु ̱र कल्प यु तुरङ्ग यु तुरङ्ग ̱र तुरङ्ग रणम् मण्डलं यु ीं मण्डलं दिनं ̱र यु ॥ तुरङ्ग ितः आहुः ॥ मण्डलं आहुः क्षमः diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task4_3d.png b/analysis_outputs/outputs_all_models_20260325/T64/task4_3d.png new file mode 100644 index 0000000000000000000000000000000000000000..e6919dde5ab42d5308c6e9c82c23fa0c719114f4 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T64/task4_3d.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task4_raw_results.json b/analysis_outputs/outputs_all_models_20260325/T64/task4_raw_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8d23ab27ea7cdcbf28dfa9c03a91bfed74de8fc6 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task4_raw_results.json @@ -0,0 +1,8 @@ +{ + "64": { + "bertscore_f1": 0.24815431178218977, + "semantic_sim": 0.05796156618536994, + "bleu": 0.0007454091523007641, + "speed_per_sample": 5.611586072924547 + } +} \ No newline at end of file diff --git 
a/analysis_outputs/outputs_all_models_20260325/T64/task4_report.txt b/analysis_outputs/outputs_all_models_20260325/T64/task4_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f2255b36929ed737d698fd21eb88c53d313f3dd --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task4_report.txt @@ -0,0 +1,14 @@ +TASK 4 — SEMANTIC ROBUSTNESS ABLATION +================================================== + +Optimal diffusion steps = 64 + + T BERT-F1 SEM_SIM BLEU sec/sample + -------------------------------------------------------- + 64 0.2482 0.0580 0.0007 5.6116 + +Marginal gains (BERT-F1): + +Saved plots/files: + - task4_3d.png + - task4_raw_results.json diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task5_guidance_results.json b/analysis_outputs/outputs_all_models_20260325/T64/task5_guidance_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a233e48c1995d01aceb8dc2ce52b61084254f600 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task5_guidance_results.json @@ -0,0 +1,44 @@ +{ + "0.0": { + "mean_cer": 0.8451135808531433, + "diversity": 0.8376039632633996, + "sent_unique": 1.0, + "distinct2": 0.6886075949367089, + "self_bleu": 0.01339966840990961 + }, + "0.5": { + "mean_cer": 0.8489806702732487, + "diversity": 0.8184169315895162, + "sent_unique": 1.0, + "distinct2": 0.65, + "self_bleu": 0.013166136820967567 + }, + "1.0": { + "mean_cer": 0.8508542801599923, + "diversity": 0.8382362515714494, + "sent_unique": 1.0, + "distinct2": 0.6835443037974683, + "self_bleu": 0.00707180065456945 + }, + "1.5": { + "mean_cer": 0.8622135155138807, + "diversity": 0.8573519831910085, + "sent_unique": 1.0, + "distinct2": 0.7196202531645569, + "self_bleu": 0.004916286782540023 + }, + "2.0": { + "mean_cer": 0.8760727953537162, + "diversity": 0.8694064256341608, + "sent_unique": 1.0, + "distinct2": 0.7436708860759493, + "self_bleu": 0.004858034807627716 + }, + "3.0": { + "mean_cer": 
0.9056323790932513, + "diversity": 0.8141376456127002, + "sent_unique": 1.0, + "distinct2": 0.6417721518987342, + "self_bleu": 0.013496860673333779 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_classifier.pt b/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_classifier.pt new file mode 100644 index 0000000000000000000000000000000000000000..95786624d9b3670dc916e05b65a6153adc6b4ec9 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_classifier.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:721653a85658125d34a1e057b28187b76adbc01fd96f462b28a2355db569344b +size 561505 diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_data.npz b/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_data.npz new file mode 100644 index 0000000000000000000000000000000000000000..137a6c38e03a452592aef77e5d837a8b85cf05fa --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_data.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe93d8027fc1e162cbc3febccae193d81850f12648f0881a0be2f3d904e73a18 +size 82512 diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_diversity_tradeoff.png b/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_diversity_tradeoff.png new file mode 100644 index 0000000000000000000000000000000000000000..38c0c76b105ff37fedb7e2c3c6c88e26737ff017 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task5_quality_diversity_tradeoff.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce435c9204d449b43df70d1c85ef0918fae33f189fcecaadc2ac0b19ee602ec3 +size 102274 diff --git a/analysis_outputs/outputs_all_models_20260325/T64/task5_report.txt b/analysis_outputs/outputs_all_models_20260325/T64/task5_report.txt new file mode 100644 index 
0000000000000000000000000000000000000000..b702deb1d0c8f4111922dcea757bfb191c9e30f3 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T64/task5_report.txt @@ -0,0 +1,15 @@ +TASK 5 — CLASSIFIER-FREE GUIDANCE +================================================== + +Classifier params: 139521 +Training samples : 20 + +Guidance scale sweep: + λ CER diversity d2 sBLEU + ---------------------------------------------------- + 0.0 0.8451 0.838 0.689 0.013 ← optimal + 0.5 0.8490 0.818 0.650 0.013 + 1.0 0.8509 0.838 0.684 0.007 + 1.5 0.8622 0.857 0.720 0.005 + 2.0 0.8761 0.869 0.744 0.005 + 3.0 0.9056 0.814 0.642 0.013 diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task1_encoder_cost.png b/analysis_outputs/outputs_all_models_20260325/T8/task1_encoder_cost.png new file mode 100644 index 0000000000000000000000000000000000000000..d06596a9248273920b7eb674728c3cc12e5a54fa Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task1_encoder_cost.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task1_kv_cache.txt b/analysis_outputs/outputs_all_models_20260325/T8/task1_kv_cache.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3f54a309b0cdf092061e282217f8e8383ee9865 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task1_kv_cache.txt @@ -0,0 +1,15 @@ +TASK 1 — KV CACHE BENCHMARK +======================================== + +has_generate_cached=True +memory_profile=Torch CPU mem-event reduction: 25.8% @ src_len=64 (std=1168.9MB, cache=866.9MB) + + src_len standard(s) cached(s) speedup encoder% + 16 0.582 0.400 1.45x 35.9% + 32 0.511 0.402 1.27x 37.7% + 64 0.666 0.490 1.36x 35.6% + +Saved graphs: + - task1_time_comparison.png + - task1_speedup.png + - task1_encoder_cost.png diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task1_speedup.png b/analysis_outputs/outputs_all_models_20260325/T8/task1_speedup.png new file mode 100644 index 
0000000000000000000000000000000000000000..23d45c962353da4f292a80bbf13d53cee6c7676a Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task1_speedup.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task1_time_comparison.png b/analysis_outputs/outputs_all_models_20260325/T8/task1_time_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..d778385452d31e7eccc5f91f1b306c19da4fb4a6 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task1_time_comparison.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task2_all_layers_t0.png b/analysis_outputs/outputs_all_models_20260325/T8/task2_all_layers_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..4c28069c573bc1d249dc56eea681a5fc7e07282c Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task2_all_layers_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_evolution.png b/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_evolution.png new file mode 100644 index 0000000000000000000000000000000000000000..e3329749592e9034979d128e1672fed9f6c6669d Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_evolution.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_t0.png b/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..27a3bacd21b3ae02281fe2263d8fadc121bb19b6 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_t7.png b/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_t7.png new file mode 100644 index 0000000000000000000000000000000000000000..13cd58b29e9aaef6957a054b29e3a4122c3bb9a0 Binary files /dev/null and 
b/analysis_outputs/outputs_all_models_20260325/T8/task2_attn_t7.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task2_report.txt b/analysis_outputs/outputs_all_models_20260325/T8/task2_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..1066ac6fe2c5f3481f89154d57c19bcf17da6094 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task2_report.txt @@ -0,0 +1,33 @@ +TASK 2 — ATTENTION + DRIFT REPORT +================================================== + +Input : dharmo rakṣati rakṣitaḥ +Output: धर्मो रक्षति रक्षितः + +Captured steps: 8 +Analysis quality: WEAK +Final output uniq-ratio: 1.000 +Degenerate output: NO +Multi-sample semantic score (n<=8): 0.0915 +Lock-in step (CER<=0.05): t=0 +Locked tokens: 79 Flexible tokens: 1 +TF-IDF vs attention stability corr: 0.8905 +TF-IDF status: OK + +Saved graphs: + - task2_attn_t*.png / task2_all_layers_t0.png + - task2_attn_evolution.png + - task2_semantic_drift.png + - task2_source_alignment.png + - task2_tfidf_vs_attention.png + +Step trajectory (first 10 rows) +------------------------------------------------------------ +t= 7 bert=0.0219 drift=0.9781 text=ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः +t= 6 bert=0.0225 drift=0.9775 text=ं ं ं ं ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः +t= 5 bert=0.0225 drift=0.9775 text=ं ं ं ं ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः +t= 4 bert=0.0225 drift=0.9775 text=ं ं ं ं ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः +t= 3 bert=0.0225 drift=0.9775 text=ं ं ं ं ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः +t= 2 bert=0.0227 drift=0.9773 text=ं ं ं ं ं ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ित +t= 1 bert=0.0228 drift=0.9772 text=ं ं ं ं ं ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ित +t= 0 bert=0.0228 drift=0.9772 text=ं ं ं ं ं ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ितः ित diff --git 
a/analysis_outputs/outputs_all_models_20260325/T8/task2_semantic_drift.png b/analysis_outputs/outputs_all_models_20260325/T8/task2_semantic_drift.png new file mode 100644 index 0000000000000000000000000000000000000000..0631e5358e2e02db9b41ab62ca34fa20c10113f9 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task2_semantic_drift.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task2_source_alignment.png b/analysis_outputs/outputs_all_models_20260325/T8/task2_source_alignment.png new file mode 100644 index 0000000000000000000000000000000000000000..435b2fd33daad226d186d790460cb9fb0c21d697 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task2_source_alignment.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task2_tfidf_vs_attention.png b/analysis_outputs/outputs_all_models_20260325/T8/task2_tfidf_vs_attention.png new file mode 100644 index 0000000000000000000000000000000000000000..11c1119839cdd79495517ef163d429035fa21080 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task2_tfidf_vs_attention.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task3_concept_space.png b/analysis_outputs/outputs_all_models_20260325/T8/task3_concept_space.png new file mode 100644 index 0000000000000000000000000000000000000000..cf4a49b7f94dc2eaf6e3756a952250550ff71c97 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task3_concept_space.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task3_diversity_curve.png b/analysis_outputs/outputs_all_models_20260325/T8/task3_diversity_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..5b7561dc83e295548c8731c989310c90ff176926 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task3_diversity_curve.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task3_diversity_direction.npy 
b/analysis_outputs/outputs_all_models_20260325/T8/task3_diversity_direction.npy new file mode 100644 index 0000000000000000000000000000000000000000..348e1caa9078fdb34ae0829dfb4043586f35b1d1 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task3_diversity_direction.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bf1e0890ccaf9d8d961e2fa23564dacd7ec657ea91ad4ae630cfcb22bd48278 +size 4224 diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task3_pca_explained_variance.png b/analysis_outputs/outputs_all_models_20260325/T8/task3_pca_explained_variance.png new file mode 100644 index 0000000000000000000000000000000000000000..e8d0c28a8e003fecd7a17a0491e3e8a877ea13fc Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task3_pca_explained_variance.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task3_report.txt b/analysis_outputs/outputs_all_models_20260325/T8/task3_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..b35aec8a45bfc63d89d3d235bd8478ce394dff61 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task3_report.txt @@ -0,0 +1,21 @@ +TASK 3 — CONCEPT VECTORS + PCA STEERING +================================================== + +PCA: 50 components, 75.9% variance +Diversity PC: 0 (|r|=-0.344 with diversity proxy) + +Direction validity: WEAK +Spectrum unique ratio (mean over 5 seeds): 1.000 +Spectrum semantic stability (mean over 5 seeds): 0.341 + +Saved graphs: + - task3_concept_space.png + - task3_pca_explained_variance.png + - task3_diversity_curve.png + +Diversity spectrum: + alpha=-2.0 → मनसः श्चक्र स्य स्य स्य अ स्य तैः तैः तैः स्य श्चक्र तैः गतभी स्य स्य स्य श्चक्र तैः तैः श्चक्र स्य तैः स्त्वं श्चक्र श्चक्र स्त्र तैः तैः कुण्ठ तैः तैः स्य तैः तैः तैः स्य तैः तैः गतभी तैः तैः णि̍ स्य तैः तैः तैः अ तैः तैः ह्वये मनसः ॥ तैः तैः गतभी ॥ श्चक्र तैः तैः तैः तैः तैः तैः तैः तैः स्य तैः करिष्या तैः स्त्वं तैः तैः श्चक्र तैः 
तैः श्चक्र तैः तैः ह्वये + alpha=-1.0 → स्य अ तैः वै तैः अ वेद् मनसः स्य । । तैः तैः स्य गतभी स्य अ स्त्वं सीद् स्य तैः स्य तैः सु̱म् सीद् र्ध कृतानि गतभी गतभी तैः तैः स्य तैः तैः तैः मनसः तैः कृतानि तैः तैः सु̱म् अ तैः मनसः अ मनसः स्य अ तैः ॥ ॥ स्य गतभी गतभी ॥ ॥ वै तैः तैः मनसः तैः अ तैः तैः च वर स्य तैः या वात् स्य तैः सीद् तैः स्य तैः स्य अ तैः तैः + alpha=+0.0 → अ अ वै ज्ञ स्य अ ज्ञ गतभी वर द शिख मन्त्र गतभी सु̱म् । द द स्य मन्त्र वा यो सीद् ज्ञ वै अ स्य स्य मन्त्र स्य मन्त्र स्य गतभी । गतभी गतभी तैः गतभी कृत तैः स्य तैः ॥ वै तैः ॥ वै अ कृतानि स्य वर वै ॥ ॥ वै ॥ अ ॥ स्य ॥ वै स्य ज्ञ ॥ स्य तैः तैः वै स्य स्य अ स्य तैः वै स्य तैः प्रण तीरे स्य । सीद् + alpha=+1.0 → पम वै तुल्य शत्रू पम शिख वर अ णाः परा णाः स्य कृत प्रिय । भिन् णाः ज्ञ वै विराज वै गणो वै ्या अ वै पम ्या भिन् वै लब्ध शोभ स्य च श वर वै ॥ वै क्षिप्य शिख भिर् ॥ सन वा मन्त्र मृ ॥ ॥ ॥ वै ॥ मन्त्र ॥ पम सङ् वर शोभ क्षिप्य भिर् स्य क्षिप्य वै सन वर शिख वै शिख वर दर्श शिख कलं पम ौ वर कलं भिर् कलं वै शिख + alpha=+2.0 → पम पम लब्ध पम शोभ पम परे भिर् अन्य णाः रसा लब्ध पम शोभ लब्ध शोभ पम शत्रू शिख भिन् पम पम पम णाः शोभ णाः पम शोभ शोभ परे णाः णाः पम पम पम पम शोभ पम शोभ शोभ पम डा णाः शोभ वै पम वै ॥ पम ॥ पम णाः ॥ ॥ परे अन्य णाः पम शोभ शोभ शोभ शोभ णाः वै परे कलं परे वै पम णाः पम शोभ णाः णाः कलं परे शोभ णाः कलं शत्रू diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task4_3d.png b/analysis_outputs/outputs_all_models_20260325/T8/task4_3d.png new file mode 100644 index 0000000000000000000000000000000000000000..528eb74d2c0c85f85669fc5c943b00350d988092 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task4_3d.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task4_raw_results.json b/analysis_outputs/outputs_all_models_20260325/T8/task4_raw_results.json new file mode 100644 index 0000000000000000000000000000000000000000..faa6341beebd60406878f0e2b8e851c4df4d37ca --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task4_raw_results.json @@ -0,0 
+1,8 @@ +{ + "8": { + "bertscore_f1": 0.12100082606041354, + "semantic_sim": 0.0400022830756336, + "bleu": 2.82720195540198e-156, + "speed_per_sample": 0.6194231500005116 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task4_report.txt b/analysis_outputs/outputs_all_models_20260325/T8/task4_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..55cc2ac72386104bec707a17cbd9bed97ac9fa6c --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task4_report.txt @@ -0,0 +1,14 @@ +TASK 4 — SEMANTIC ROBUSTNESS ABLATION +================================================== + +Optimal diffusion steps = 8 + + T BERT-F1 SEM_SIM BLEU sec/sample + -------------------------------------------------------- + 8 0.1210 0.0400 0.0000 0.6194 + +Marginal gains (BERT-F1): + +Saved plots/files: + - task4_3d.png + - task4_raw_results.json diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task5_guidance_results.json b/analysis_outputs/outputs_all_models_20260325/T8/task5_guidance_results.json new file mode 100644 index 0000000000000000000000000000000000000000..26d026eb2fdb05cecc095723ba7ef5e014111412 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task5_guidance_results.json @@ -0,0 +1,44 @@ +{ + "0.0": { + "mean_cer": 0.8834048320040591, + "diversity": 0.7959807396207479, + "sent_unique": 1.0, + "distinct2": 0.5962025316455696, + "self_bleu": 0.0042410524040737176 + }, + "0.5": { + "mean_cer": 0.8880994023511585, + "diversity": 0.7813082684105759, + "sent_unique": 1.0, + "distinct2": 0.5680379746835443, + "self_bleu": 0.005421437862392686 + }, + "1.0": { + "mean_cer": 0.8876164305728409, + "diversity": 0.7668309841779627, + "sent_unique": 1.0, + "distinct2": 0.5401898734177215, + "self_bleu": 0.0065279050617961685 + }, + "1.5": { + "mean_cer": 0.892132064260031, + "diversity": 0.7565293273801122, + "sent_unique": 1.0, + "distinct2": 0.5174050632911392, + "self_bleu": 
0.0043464085309147735 + }, + "2.0": { + "mean_cer": 0.8929310078629407, + "diversity": 0.7341781360646314, + "sent_unique": 1.0, + "distinct2": 0.4740506329113924, + "self_bleu": 0.005694360782129518 + }, + "3.0": { + "mean_cer": 0.896964696872683, + "diversity": 0.7240438326365156, + "sent_unique": 1.0, + "distinct2": 0.4534810126582278, + "self_bleu": 0.005393347385196553 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_classifier.pt b/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_classifier.pt new file mode 100644 index 0000000000000000000000000000000000000000..90a8e600bd80806c66cabbb2fbcac794362717e3 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_classifier.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a7c94f9d8272a570fe00c17ee3201aea4dd69d2e71e22021059cf62d4f87c09 +size 561505 diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_data.npz b/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_data.npz new file mode 100644 index 0000000000000000000000000000000000000000..bdbbbdad2ab4269f031b23d213fe034b6aeb3d23 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_data.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:823c05842e84f3758b5cdbd2c8e35a443cbf73b369251555aa88de8bd4aa1a6c +size 164512 diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_diversity_tradeoff.png b/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_diversity_tradeoff.png new file mode 100644 index 0000000000000000000000000000000000000000..22633ce4bf5848aacacf36d2a9eccc4e688de7b0 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/T8/task5_quality_diversity_tradeoff.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/T8/task5_report.txt 
b/analysis_outputs/outputs_all_models_20260325/T8/task5_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c54b674afed1a91a7d77cabf093aab36b8a225e --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/T8/task5_report.txt @@ -0,0 +1,15 @@ +TASK 5 — CLASSIFIER-FREE GUIDANCE +================================================== + +Classifier params: 139521 +Training samples : 40 + +Guidance scale sweep: + λ CER diversity d2 sBLEU + ---------------------------------------------------- + 0.0 0.8834 0.796 0.596 0.004 ← optimal + 0.5 0.8881 0.781 0.568 0.005 + 1.0 0.8876 0.767 0.540 0.007 + 1.5 0.8921 0.757 0.517 0.004 + 2.0 0.8929 0.734 0.474 0.006 + 3.0 0.8970 0.724 0.453 0.005 diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_encoder_cost.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_encoder_cost.png new file mode 100644 index 0000000000000000000000000000000000000000..bdbc45a493fdb316f653fe848973c9ff101d05af Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_encoder_cost.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_kv_cache.txt b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_kv_cache.txt new file mode 100644 index 0000000000000000000000000000000000000000..73e939b9bb6b15b1cbbdf786d4d9fee6a96786fc --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_kv_cache.txt @@ -0,0 +1,15 @@ +TASK 1 — KV CACHE BENCHMARK +======================================== + +has_generate_cached=True +memory_profile=Torch CPU mem-event reduction: 22.4% @ src_len=64 (std=577.8MB, cache=448.4MB) + + src_len standard(s) cached(s) speedup encoder% + 16 0.469 0.336 1.40x 37.2% + 32 0.334 0.247 1.35x 33.6% + 64 0.445 0.755 0.59x 33.4% + +Saved graphs: + - task1_time_comparison.png + - task1_speedup.png + - task1_encoder_cost.png diff --git 
a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_speedup.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_speedup.png new file mode 100644 index 0000000000000000000000000000000000000000..d2fe77f03a746d50242464e98c0257a540872a98 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_speedup.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_time_comparison.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_time_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..f16c756898fb09ecc33c7c25ffe0cbc9d9c3e473 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task1_time_comparison.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_all_layers_t0.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_all_layers_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..59f9cb38cb7d4c1bd8b8c83a73e7892d3cb1e903 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_all_layers_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_evolution.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_evolution.png new file mode 100644 index 0000000000000000000000000000000000000000..695b4823060b488eac61eb0d027f99f14a7391af Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_evolution.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t0 copy.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t0 copy.png new file mode 100644 index 0000000000000000000000000000000000000000..7414c27a3ee624a7a6f9d3d44afaa57caac3fa48 Binary files 
/dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t0 copy.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t0.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t0.png new file mode 100644 index 0000000000000000000000000000000000000000..7414c27a3ee624a7a6f9d3d44afaa57caac3fa48 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t0.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t3.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t3.png new file mode 100644 index 0000000000000000000000000000000000000000..3f0af5579f862ab96317fefc9eb701bd96f8693d Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_attn_t3.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_report.txt b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..da9a28edb1723b75718e804a05c7200e86b7021d --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_report.txt @@ -0,0 +1,29 @@ +TASK 2 — ATTENTION + DRIFT REPORT +================================================== + +Input : dharmo rakṣati rakṣitaḥ +Output: धर्मो रक्षति रक्षितः + +Captured steps: 4 +Analysis quality: WEAK +Final output uniq-ratio: 1.000 +Degenerate output: NO +Multi-sample semantic score (n<=8): 0.0568 +Lock-in step (CER<=0.05): t=0 +Locked tokens: 80 Flexible tokens: 0 +TF-IDF vs attention stability corr: 0.2552 +TF-IDF status: OK + +Saved graphs: + - task2_attn_t*.png / task2_all_layers_t0.png + - task2_attn_evolution.png + - task2_semantic_drift.png + - task2_source_alignment.png + - task2_tfidf_vs_attention.png + +Step trajectory (first 10 rows) 
+------------------------------------------------------------ +t= 3 bert=0.0288 drift=0.9712 text=निर्यात निर्यात नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि +t= 2 bert=0.0288 drift=0.9712 text=निर्यात निर्यात नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि +t= 1 bert=0.0288 drift=0.9712 text=निर्यात निर्यात नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि +t= 0 bert=0.0288 drift=0.9712 text=निर्यात निर्यात नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_semantic_drift.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_semantic_drift.png new file mode 100644 index 0000000000000000000000000000000000000000..a978d48b573e717885761ddb9cc32cf41d61b491 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_semantic_drift.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_source_alignment.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_source_alignment.png new file mode 100644 index 0000000000000000000000000000000000000000..3a87b8eb758340f4a672ee984808b4920659aee5 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_source_alignment.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_tfidf_vs_attention.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_tfidf_vs_attention.png new file mode 100644 index 0000000000000000000000000000000000000000..a572402cce935b5f33041e10213546482577d110 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task2_tfidf_vs_attention.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_concept_space.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_concept_space.png new file mode 100644 index 
0000000000000000000000000000000000000000..d6386e0f2c98a1df060025fe1c422759b3ece5ee Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_concept_space.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_diversity_curve.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_diversity_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..4c4ad6d17c5e0ad7d4ba4ef8a250dbd50cd5e864 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_diversity_curve.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_diversity_direction.npy b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_diversity_direction.npy new file mode 100644 index 0000000000000000000000000000000000000000..d286c611097f2c5bc35ba4ec016497919ed34248 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_diversity_direction.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e77cb587929972f9a63a0fa890981e565948d34e4f71c97ca4e5e327a802aa7 +size 4224 diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_pca_explained_variance.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_pca_explained_variance.png new file mode 100644 index 0000000000000000000000000000000000000000..d3e94cef50c1f552b03815e25b0cebcb1e722b56 Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_pca_explained_variance.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_report.txt b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..33d2624efe1c68d5bfe38a1dba346a829259b41c --- /dev/null +++ 
b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task3_report.txt @@ -0,0 +1,21 @@ +TASK 3 — CONCEPT VECTORS + PCA STEERING +================================================== + +PCA: 50 components, 90.9% variance +Diversity PC: 0 (|r|=-0.565 with diversity proxy) + +Direction validity: WEAK +Spectrum unique ratio (mean over 5 seeds): 0.800 +Spectrum semantic stability (mean over 5 seeds): 0.222 + +Saved graphs: + - task3_concept_space.png + - task3_pca_explained_variance.png + - task3_diversity_curve.png + +Diversity spectrum: + alpha=-2.0 → निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात + alpha=-1.0 → निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात निर्यात + alpha=+0.0 → निर्यात निर्यात निर्यात किं निर्यात निर्यात निर्यात किं चत्वार निर्यात दा विष्णुर् निर्यात नाशयामसि निर्यात निर्यात निर्यात 
निर्यात निर्यात निर्यात निर्यात किं किं किं किं किं किं निर्यात दा किं निर्यात विष्णुर् निर्यात किं निर्यात विष्णुर् विष्ण किं किं नाशयामसि किं किं निर्यात किं निर्यात दा निर्यात निर्यात त्र निर्यात निर्यात किं निर्यात निर्यात किं विष्णुर् किं विष्णुर् किं किं किं विष्ण विष्णुर् विष्णुर् विष्णुर् विष्णुर् निर्यात कुर्वा विष्णुर् निर्यात विष्णुर् दा किं किं कुर्वा किं किं विष्ण विष्णुर् किं + alpha=+1.0 → किं किं नेमि नेमि नेमि नेमि किं नेमि दा विष्ण नेमि दा विष्णुर् ति̱रो देवि देवि नेमि किं नेमि विष्णुर् विष्णुर् नेमि किं किं विष्णुर् दा नेमि नाशयामसि किं नेमि नेमि नेमि नेमि नेमि नेमि किं किं विष्ण विष्ण नेमि दा नेमि नेमि किं विष्ण नेमि नेमि किं नेमि विष्ण नेमि कुर्वा विष्ण विष्णुर् किं कुर्वा विष्णुर् नेमि किं किं किं किं दा विष्णुर् किं विष्णुर् दा कुर्वा पर्यु दा दा दा किं किं किं विष्ण किं नेमि नेमि नेमि + alpha=+2.0 → नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि ेह विष्ण नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि नेमि diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_guidance_results.json b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_guidance_results.json new file mode 100644 index 0000000000000000000000000000000000000000..89d1de774f6962db4e89fafd9a567b68e79d9f1b --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_guidance_results.json @@ -0,0 +1,44 @@ +{ + "0.0": { + "mean_cer": 0.930712354616864, + "diversity": 0.5361420448852726, + "sent_unique": 1.0, + "distinct2": 0.12197053698716934, + "self_bleu": 0.049686447216624144 + }, + "0.5": { + "mean_cer": 0.9324607353533384, + "diversity": 0.5409936669463189, + "sent_unique": 1.0, + "distinct2": 0.13827912193558725, + 
"self_bleu": 0.05629178804294949 + }, + "1.0": { + "mean_cer": 0.9327692174661127, + "diversity": 0.539524600660052, + "sent_unique": 1.0, + "distinct2": 0.13571082781991287, + "self_bleu": 0.056661626499808854 + }, + "1.5": { + "mean_cer": 0.9337645839445354, + "diversity": 0.5236698765592963, + "sent_unique": 1.0, + "distinct2": 0.11392405063291139, + "self_bleu": 0.06658429751431877 + }, + "2.0": { + "mean_cer": 0.9359116945771591, + "diversity": 0.518451682792119, + "sent_unique": 0.9875, + "distinct2": 0.09948068808828303, + "self_bleu": 0.0625773225040451 + }, + "3.0": { + "mean_cer": 0.9388802190743608, + "diversity": 0.48712548074553663, + "sent_unique": 0.9875, + "distinct2": 0.07141684452470859, + "self_bleu": 0.0971658830336353 + } +} \ No newline at end of file diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_classifier.pt b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_classifier.pt new file mode 100644 index 0000000000000000000000000000000000000000..74bcf190ebce6158138d9964dcaff5018f9035ee --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_classifier.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f94c2074f2a31c81f013cad84eb4556181a80adef257804e7d27ec77220b4801 +size 561505 diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_data.npz b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_data.npz new file mode 100644 index 0000000000000000000000000000000000000000..e5dca9966a31ae8a08d6a1127bc6cff4b60c6e21 --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_data.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b80d72357b80861c965423e5fe3a80d193035fc29d960a80628bec93c142ff +size 2050512 diff --git 
a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_diversity_tradeoff.png b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_diversity_tradeoff.png new file mode 100644 index 0000000000000000000000000000000000000000..5769a18b7a8e931598085abbb19ea80962ce08ca Binary files /dev/null and b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_quality_diversity_tradeoff.png differ diff --git a/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_report.txt b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_report.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5cc600b0f376c80b2ee0fd5a0f1b96b126a449e --- /dev/null +++ b/analysis_outputs/outputs_all_models_20260325/encoder_decoder/T4/task5_report.txt @@ -0,0 +1,15 @@ +TASK 5 — CLASSIFIER-FREE GUIDANCE +================================================== + +Classifier params: 139521 +Training samples : 500 + +Guidance scale sweep: + λ CER diversity d2 sBLEU + ---------------------------------------------------- + 0.0 0.9307 0.536 0.122 0.050 ← optimal + 0.5 0.9325 0.541 0.138 0.056 + 1.0 0.9328 0.540 0.136 0.057 + 1.5 0.9338 0.524 0.114 0.067 + 2.0 0.9359 0.518 0.099 0.063 + 3.0 0.9389 0.487 0.071 0.097 diff --git a/app.py b/app.py index e278f4d1cbd6836d5bb0aaa0b0000a0e7a9d991f..2024ad199b55ff81fa8151c93f58dc751f30b73d 100644 --- a/app.py +++ b/app.py @@ -28,7 +28,7 @@ from model.tokenizer import SanskritSourceTokenizer, SanskritTargetTokenizer RESULTS_DIR = "generated_results" -DEFAULT_ANALYSIS_OUT = "analysis_outputs/T4" +DEFAULT_ANALYSIS_OUT = "analysis_outputs/outputs_all_models_20260325/T4" os.makedirs(RESULTS_DIR, exist_ok=True) _BG_JOBS = {} @@ -126,6 +126,32 @@ def _task5_cfg(lambda_min, lambda_max, lambda_step, task5_samples): HF_DEFAULT_MODEL_REPO = os.environ.get("HF_DEFAULT_MODEL_REPO", "bhsinghgrid/DevaFlow") HF_DEFAULT_MODEL_FILE = os.environ.get("HF_DEFAULT_MODEL_FILE", 
"best_model.pt") +HF_CHECKPOINT_REPO = os.environ.get("HF_CHECKPOINT_REPO", "bhsinghgrid/devflow2") +HF_CHECKPOINT_FILE = os.environ.get("HF_CHECKPOINT_FILE", "best_model.pt") +HF_MODEL_REPOS = [ + repo.strip() + for repo in os.environ.get("HF_MODEL_REPOS", "bhsinghgrid/DevaFlow,bhsinghgrid/devflow2").split(",") + if repo.strip() +] +HF_DEFAULT_MODEL_TYPE = os.environ.get("HF_DEFAULT_MODEL_TYPE", "d3pm_cross_attention") +HF_DEFAULT_INCLUDE_NEG = os.environ.get("HF_DEFAULT_INCLUDE_NEG", "false") +HF_DEFAULT_TYPE = os.environ.get("HF_DEFAULT_TYPE", "d3pm_encoder_decoder") +HF_DEFAULT_NEG = os.environ.get("HF_DEFAULT_NEG", "false") +HF_DEFAULT_NUM_STEPS = os.environ.get("HF_DEFAULT_NUM_STEPS") +HF_DEFAULT_MODEL_SETTINGS_FILE = os.environ.get("HF_DEFAULT_MODEL_SETTINGS_FILE", "model_settings.json") + +# import os +# # from huggingface_hub import hf_hub_download +# +# HF_CHECKPOINT_REPO = os.environ.get("HF_CHECKPOINT_REPO", "bhsinghgrid/devflow2") +# HF_CHECKPOINT_FILE = os.environ.get("HF_CHECKPOINT_FILE", "best_model.pt") + +checkpoint_path = hf_hub_download( + repo_id=HF_CHECKPOINT_REPO, + filename=HF_CHECKPOINT_FILE, + repo_type="model", +) + def _download_hf_default_checkpoint(): @@ -143,9 +169,71 @@ def _download_hf_default_checkpoint(): return None +def _download_hf_model_settings(): + try: + cache_dir = Path(".hf_model_cache") + cache_dir.mkdir(parents=True, exist_ok=True) + settings_path = hf_hub_download( + repo_id=HF_DEFAULT_MODEL_REPO, + filename=HF_DEFAULT_MODEL_SETTINGS_FILE, + local_dir=str(cache_dir), + local_dir_use_symlinks=False, + ) + with open(settings_path, "r", encoding="utf-8") as f: + data = json.load(f) + return data if isinstance(data, dict) else {} + except Exception: + return {} + + +HF_DEFAULT_SETTINGS = _download_hf_model_settings() + + +def _repo_cache_dir(repo_id: str) -> Path: + safe = repo_id.replace("/", "__") + path = Path(".hf_model_cache") / safe + path.mkdir(parents=True, exist_ok=True) + return path + + +def 
_download_hf_checkpoint(repo_id: str, filename: str = "best_model.pt"): + try: + cache_dir = _repo_cache_dir(repo_id) + return hf_hub_download( + repo_id=repo_id, + filename=filename, + local_dir=str(cache_dir), + local_dir_use_symlinks=False, + ) + except Exception: + return None + + +def _download_hf_settings_for_repo(repo_id: str): + try: + cache_dir = _repo_cache_dir(repo_id) + settings_path = hf_hub_download( + repo_id=repo_id, + filename=HF_DEFAULT_MODEL_SETTINGS_FILE, + local_dir=str(cache_dir), + local_dir_use_symlinks=False, + ) + with open(settings_path, "r", encoding="utf-8") as f: + data = json.load(f) + return data if isinstance(data, dict) else {} + except Exception: + return {} + + def discover_checkpoints(): found = [] - for root in ("ablation_results", "results7", "results"): + local_roots = [ + ("ablation_results", "cross_attention"), + (os.path.join("ablation_results", "encoder_decoder"), "encoder_decoder"), + ("results7", "other"), + ("results", "other"), + ] + for root, family in local_roots: if not os.path.isdir(root): continue for entry in sorted(os.listdir(root)): @@ -158,30 +246,54 @@ def discover_checkpoints(): "path": ckpt, "experiment": entry, "root": root, + "family": family, } ) - # Space-safe fallback: always expose one downloadable checkpoint option. 
- hf_ckpt = _download_hf_default_checkpoint() - if hf_ckpt and os.path.exists(hf_ckpt): + for repo_id in HF_MODEL_REPOS: + settings = _download_hf_settings_for_repo(repo_id) + model_type = settings.get("model_type", "") + family = "encoder_decoder" if model_type == "d3pm_encoder_decoder" else "cross_attention" + num_steps = settings.get("num_steps", HF_DEFAULT_NUM_STEPS) + step_label = f"T{num_steps}" if num_steps else "HF" found.append( { - "label": f"HF default [{HF_DEFAULT_MODEL_REPO}]", - "path": hf_ckpt, - "experiment": "hf_default", + "label": f"{repo_id} [{family}:{step_label}]", + "path": None, + "experiment": step_label, "root": "hf", + "family": family, + "repo_id": repo_id, + "repo_file": HF_CHECKPOINT_FILE, + "hf_settings": settings, } ) return found -def _guess_analysis_dir(experiment: str, ckpt_path: str) -> str: +def _guess_analysis_dir(experiment: str, ckpt_path: str, family: str = "cross_attention", settings: dict | None = None) -> str: + settings = settings or {} base = Path("analysis_outputs") + packaged = base / "outputs_all_models_20260325" + step = None + if experiment and experiment.startswith("T") and experiment[1:].isdigit(): + step = experiment + elif settings.get("num_steps"): + step = f"T{int(settings['num_steps'])}" + else: + for part in Path(ckpt_path or "").parts: + if part.startswith("T") and part[1:].isdigit(): + step = part + break + if packaged.exists() and step: + if family == "encoder_decoder" and (packaged / "encoder_decoder" / step).is_dir(): + return str(packaged / "encoder_decoder" / step) + if (packaged / step).is_dir(): + return str(packaged / step) if base.exists(): - if experiment and (base / experiment).is_dir(): - return str(base / experiment) - for part in Path(ckpt_path).parts: - if part.startswith("T") and part[1:].isdigit() and (base / part).is_dir(): - return str(base / part) + if family == "encoder_decoder" and step and (base / "encoder_decoder" / step).is_dir(): + return str(base / "encoder_decoder" / step) + if 
step and (base / step).is_dir(): + return str(base / step) if (base / "T4").is_dir(): return str(base / "T4") return os.path.join("analysis", "outputs_ui", experiment or "default") @@ -225,12 +337,38 @@ def infer_include_negative(experiment_name: str, root: str = "") -> bool: return CONFIG["data"]["include_negative_examples"] -def build_runtime_cfg(ckpt_path: str): - experiment = os.path.basename(os.path.dirname(ckpt_path)) - root = os.path.basename(os.path.dirname(os.path.dirname(ckpt_path))) +def build_runtime_cfg(ckpt_path: str, item: dict | None = None): + item = item or {} + if item.get("root") == "hf": + experiment = item.get("experiment", "hf") + root = "hf" + hf_settings = item.get("hf_settings", {}) + else: + experiment = os.path.basename(os.path.dirname(ckpt_path)) + root = os.path.basename(os.path.dirname(os.path.dirname(ckpt_path))) + hf_settings = {} cfg = copy.deepcopy(CONFIG) - cfg["model_type"] = infer_model_type(experiment, root=root) - cfg["data"]["include_negative_examples"] = infer_include_negative(experiment, root=root) + if root == "hf": + cfg["model_type"] = os.environ.get( + "HF_DEFAULT_MODEL_TYPE", + hf_settings.get("model_type", HF_DEFAULT_SETTINGS.get("model_type", HF_DEFAULT_MODEL_TYPE)), + ) + include_neg_raw = os.environ.get( + "HF_DEFAULT_INCLUDE_NEG", + str(hf_settings.get("include_negative_examples", HF_DEFAULT_SETTINGS.get("include_negative_examples", HF_DEFAULT_INCLUDE_NEG))), + ) + cfg["data"]["include_negative_examples"] = include_neg_raw.lower() == "true" + t_raw = os.environ.get( + "HF_DEFAULT_NUM_STEPS", + hf_settings.get("num_steps", HF_DEFAULT_SETTINGS.get("num_steps", HF_DEFAULT_NUM_STEPS)), + ) + if t_raw: + t_val = int(t_raw) + cfg["model"]["diffusion_steps"] = t_val + cfg["inference"]["num_steps"] = t_val + else: + cfg["model_type"] = infer_model_type(experiment, root=root) + cfg["data"]["include_negative_examples"] = infer_include_negative(experiment, root=root) if root == "ablation_results" and 
experiment.startswith("T") and experiment[1:].isdigit(): t_val = int(experiment[1:]) @@ -262,14 +400,23 @@ def load_selected_model(checkpoint_label): if checkpoint_label not in mapping: raise gr.Error("Selected checkpoint not found. Click refresh.") - ckpt_path = mapping[checkpoint_label]["path"] - cfg, device, experiment = build_runtime_cfg(ckpt_path) + item = mapping[checkpoint_label] + ckpt_path = item.get("path") + if item.get("root") == "hf": + ckpt_path = _download_hf_checkpoint(item["repo_id"], item.get("repo_file", HF_CHECKPOINT_FILE)) + if not ckpt_path or not os.path.exists(ckpt_path): + raise gr.Error(f"Failed to download checkpoint from {item['repo_id']}.") + item["path"] = ckpt_path + cfg, device, experiment = build_runtime_cfg(ckpt_path, item=item) model, cfg = load_model(ckpt_path, cfg, device) src_tok, tgt_tok = _build_tokenizers(cfg) bundle = { "ckpt_path": ckpt_path, "experiment": experiment, + "family": item.get("family", "cross_attention"), + "repo_id": item.get("repo_id"), + "hf_settings": item.get("hf_settings", {}), "device": str(device), "cfg": cfg, "model": model, @@ -288,9 +435,16 @@ def load_selected_model(checkpoint_label): "d_model": cfg["model"]["d_model"], "n_layers": cfg["model"]["n_layers"], "n_heads": cfg["model"]["n_heads"], + "family": item.get("family", "cross_attention"), + "repo_id": item.get("repo_id"), } status = f"Loaded `{experiment}` on `{device}` (`{cfg['model_type']}`)" - suggested_out = _guess_analysis_dir(experiment, ckpt_path) + suggested_out = _guess_analysis_dir( + experiment, + ckpt_path, + family=item.get("family", "cross_attention"), + settings=item.get("hf_settings", {}), + ) return bundle, status, model_info, cfg["inference"]["num_steps"], suggested_out @@ -495,7 +649,12 @@ def _run_analysis_cmd(task, ckpt_path, output_dir, input_text="dharmo rakṣati def _bundle_task_outputs(model_bundle, output_dir): - src_dir = _guess_analysis_dir(model_bundle.get("experiment", ""), model_bundle.get("ckpt_path", "")) + 
src_dir = _guess_analysis_dir( + model_bundle.get("experiment", ""), + model_bundle.get("ckpt_path", ""), + family=model_bundle.get("family", "cross_attention"), + settings=model_bundle.get("hf_settings", {}), + ) if not os.path.isdir(src_dir): return os.makedirs(output_dir, exist_ok=True) @@ -1011,6 +1170,21 @@ def _generate_with_flow( return out_text, status, meta, flow +def load_selected_model_with_outputs(checkpoint_label): + bundle, status, info, steps, out_dir = load_selected_model(checkpoint_label) + outputs = _safe_refresh_task_outputs(out_dir) + flow = _build_flow_markdown(model_loaded=True, inference_ready=False, task_states={}) + return bundle, status, info, steps, out_dir, flow, *outputs + + +def auto_load_default_with_outputs(): + choices = list(checkpoint_map().keys()) + if not choices: + empty = _safe_refresh_task_outputs(DEFAULT_ANALYSIS_OUT) + return None, "No checkpoints found.", {}, 64, DEFAULT_ANALYSIS_OUT, _build_flow_markdown(model_loaded=False, inference_ready=False, task_states={}), *empty + return load_selected_model_with_outputs(default_checkpoint_label()) + + CUSTOM_CSS = """ :root { --bg1: #f5fbff; @@ -1176,9 +1350,26 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo: refresh_btn.click(fn=refresh_checkpoints, outputs=[checkpoint_dropdown, load_status]) load_btn.click( - fn=load_selected_model, + fn=load_selected_model_with_outputs, inputs=[checkpoint_dropdown], - outputs=[model_state, load_status, model_info, num_steps, analysis_output_dir], + outputs=[ + model_state, + load_status, + model_info, + num_steps, + analysis_output_dir, + flow_box, + task1_box, + task2_box, + task2_drift_img, + task2_attn_img, + task2_tmax_img, + task2_evolution_img, + task3_box, + task3_img, + task5_box, + task4_img, + ], ) preset.change( @@ -1284,13 +1475,14 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo: ], ) demo.load( - fn=auto_load_default, - outputs=[model_state, load_status, 
model_info, num_steps, analysis_output_dir], - ) - demo.load( - fn=_safe_refresh_task_outputs, - inputs=[analysis_output_dir], + fn=auto_load_default_with_outputs, outputs=[ + model_state, + load_status, + model_info, + num_steps, + analysis_output_dir, + flow_box, task1_box, task2_box, task2_drift_img,