stanfordnlp/snli
Viewer • Updated • 570k • 24.4k • 94
Dhvani v7 fixes the critical surface↔abhida head collapse in v6 (ρ=0.985 → -0.009) using hinge-based cross-covariance decorrelation. The three heads now produce genuinely independent embedding subspaces.
| Metric | v6 | v7 | Delta |
|---|---|---|---|
| Surface↔Abhida correlation | 0.985 | -0.009 | Fixed ✅ |
| STS17 (surface) | 0.868 | 0.883 | +0.015 |
| STS17 (abhida) | 0.858 | 0.854 | -0.004 |
| STS17 (vyanjana) | 0.804 | 0.817 | +0.013 |
| Abhida meaning separation | — | 0.692 | New metric |
| Vyanjana register gap | 1.6 | 1.656 | Maintained |
| Register probe accuracy | 1.0 | 1.0 | Maintained |
Input → Qwen3-1.7B (LoRA r=16, α=32) → Mean Pool (2048)
→ Shared Trunk (Linear 2048→1024 + LN + GELU)
→ Surface Head (Linear 1024→512 + LN) — lexical/syntactic similarity
→ Abhida Head (Linear 1024→512 + LN) — deep meaning (decorrelated from surface)
→ Vyanjana Head (Linear 1024→512 + LN) — register/tone
All heads → L2 normalized
v6 was trained with a weak orthogonality penalty (weight=0.1, dot-product only), which let the surface and abhida heads collapse onto each other.
v7 replaces that penalty with hinge-based cross-covariance decorrelation.
{
'step': 2500,
'config': {...},
'metrics': {'cos_surf_abhi': -0.009, 'sts17_surface': 0.883, ...},
'lora': model.base.state_dict(),
'trunk': model.trunk.state_dict(),
'surface_head': model.surface_head.state_dict(),
'abhida_head': model.abhida_head.state_dict(),
'vyanjana_head': model.vyanjana_head.state_dict(),
'optimizer': ...,
'scheduler': ...,
}
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from huggingface_hub import hf_hub_download
class DhvaniV7(nn.Module):
    """LoRA-adapted transformer encoder with a shared trunk and three heads.

    The base model's last hidden states are mean-pooled over the attention
    mask, passed through a shared trunk (Linear + LayerNorm + GELU), and then
    projected by three parallel heads (Linear + LayerNorm each): ``surface``,
    ``abhida`` and ``vyanjana``.

    Args:
        cfg: Mapping that provides ``base_model``, ``lora_r``, ``lora_alpha``,
            ``lora_dropout``, ``lora_targets``, ``hidden_dim``, ``trunk_dim``
            and ``subspace_dim``.
    """

    def __init__(self, cfg):
        super().__init__()
        base = AutoModel.from_pretrained(
            cfg['base_model'], torch_dtype=torch.bfloat16,
            attn_implementation='eager', trust_remote_code=True
        )
        lora_config = LoraConfig(
            r=cfg['lora_r'], lora_alpha=cfg['lora_alpha'],
            lora_dropout=cfg['lora_dropout'],
            target_modules=cfg['lora_targets'],
            bias='none', task_type=TaskType.FEATURE_EXTRACTION
        )
        # The backbone is loaded in bfloat16 and wrapped with LoRA adapters.
        self.base = get_peft_model(base, lora_config)
        # Trunk shared by all three heads.
        self.trunk = nn.Sequential(
            nn.Linear(cfg['hidden_dim'], cfg['trunk_dim']),
            nn.LayerNorm(cfg['trunk_dim']), nn.GELU(),
        )
        # The three heads have the same architecture and separate weights.
        self.surface_head = nn.Sequential(
            nn.Linear(cfg['trunk_dim'], cfg['subspace_dim']),
            nn.LayerNorm(cfg['subspace_dim']),
        )
        self.abhida_head = nn.Sequential(
            nn.Linear(cfg['trunk_dim'], cfg['subspace_dim']),
            nn.LayerNorm(cfg['subspace_dim']),
        )
        self.vyanjana_head = nn.Sequential(
            nn.Linear(cfg['trunk_dim'], cfg['subspace_dim']),
            nn.LayerNorm(cfg['subspace_dim']),
        )

    @staticmethod
    def mean_pool(hidden, mask):
        """Average ``hidden`` over dim 1, counting only positions where ``mask`` is set.

        Assumes ``hidden`` is (batch, seq, dim) and ``mask`` is (batch, seq)
        with 1 for real tokens and 0 for padding -- TODO confirm against callers.
        """
        m = mask.unsqueeze(-1).float()
        # The clamp avoids division by zero for a row whose mask is all zeros.
        return (hidden * m).sum(1) / m.sum(1).clamp(min=1e-9)

    def encode_tokens(self, input_ids, attention_mask):
        """Encode a tokenized batch into the per-head and combined embeddings.

        Returns:
            Dict with keys ``'surface'``, ``'abhida'``, ``'vyanjana'`` (each
            head's output, L2-normalized along the last dim) and ``'full'``
            (the three raw head outputs concatenated along the last dim, then
            L2-normalized as a whole).
        """
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        # Cast to float32 before pooling; the backbone is loaded in bfloat16.
        pooled = self.mean_pool(out.last_hidden_state.float(), attention_mask)
        trunk = self.trunk(pooled)

        # Run each head exactly once and reuse the raw outputs for both the
        # per-head embeddings and the concatenated 'full' embedding.
        surface = self.surface_head(trunk)
        abhida = self.abhida_head(trunk)
        vyanjana = self.vyanjana_head(trunk)

        return {
            'surface': F.normalize(surface, p=2, dim=-1),
            'abhida': F.normalize(abhida, p=2, dim=-1),
            'vyanjana': F.normalize(vyanjana, p=2, dim=-1),
            'full': F.normalize(
                torch.cat([surface, abhida, vyanjana], dim=-1), p=2, dim=-1
            ),
        }
# Load
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects from the downloaded file; only use it with a checkpoint you trust.
ckpt_path = hf_hub_download(repo_id="rb512/dhvani-v7", filename="v7_best.pt")
ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
cfg = ckpt["config"]

tokenizer = AutoTokenizer.from_pretrained(cfg['base_model'], trust_remote_code=True)
if tokenizer.pad_token is None:
    # Padding is requested below, so fall back to the EOS token as pad token.
    tokenizer.pad_token = tokenizer.eos_token

model = DhvaniV7(cfg)
# Restore each submodule from its checkpoint entry; the LoRA-wrapped base
# model is stored under the "lora" key.
for ckpt_key, attr_name in (
    ("lora", "base"),
    ("trunk", "trunk"),
    ("surface_head", "surface_head"),
    ("abhida_head", "abhida_head"),
    ("vyanjana_head", "vyanjana_head"),
):
    getattr(model, attr_name).load_state_dict(ckpt[ckpt_key])
model.eval()

# Encode
texts = ["The cat sat on the mat.", "A feline rested upon the rug."]
enc = tokenizer(
    texts,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
with torch.no_grad():
    embs = model.encode_tokens(enc['input_ids'], enc['attention_mask'])

# Dot product of the two sentences' embeddings, per head. The embeddings are
# L2-normalized, so this is their cosine similarity.
sims = {
    head: (embs[head][0] @ embs[head][1]).item()
    for head in ('surface', 'abhida', 'vyanjana')
}
# Surface: high similarity (paraphrases)
# Abhida: high similarity (same meaning, decorrelated from surface)
# Vyanjana: similar (same register)
print(f"Surface sim: {sims['surface']:.3f}")
print(f"Abhida sim: {sims['abhida']:.3f}")
print(f"Vyanjana sim: {sims['vyanjana']:.3f}")
Named after Ānandavardhana's 9th-century theory of dhvani (resonance) in Sanskrit poetics.
@article{dhvani2026,
title={Dhvani: Structured Multi-Head Embeddings that Separate What Was Said from How It Was Said},
author={Baxi, Rahul},
year={2026},
note={VyasaLabs Technical Report}
}
Apache 2.0