Spaces:

vigneshwar234
/

TemporalMesh-Transformer-Demo

Running

App Files Files Community

vigneshwar234 committed 19 days ago

Commit

1e98d80

verified ·

1 Parent(s): 97f4e30

Add app.py

Browse files

Files changed (1) hide show

app.py +458 -0

app.py ADDED Viewed

	@@ -0,0 +1,458 @@

+"""
+TemporalMesh Transformer — Interactive Demo Space
+Hugging Face Space: vigneshwar234/TemporalMesh-Transformer-Demo
+"""
+import gradio as gr
+import torch
+import torch.nn.functional as F
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+from io import BytesIO
+from PIL import Image
+import random, math, textwrap
+# ── Minimal self-contained TMT implementation for the demo ──────────────────
+class TMTConfig:
+    def __init__(self):
+        self.vocab_size     = 1000
+        self.d_model        = 128
+        self.n_heads        = 4
+        self.n_layers       = 6
+        self.max_seq_len    = 64
+        self.graph_k        = 4
+        self.exit_threshold = 0.80
+        self.memory_anchors = 8
+        self.dropout        = 0.0
+class MeshBuilder(torch.nn.Module):
+    def __init__(self, k): super().__init__(); self.k = k
+    def forward(self, x):
+        B, S, D = x.shape
+        xn = F.normalize(x, dim=-1)
+        sim = torch.bmm(xn, xn.transpose(1,2))
+        sim.fill_diagonal_(-1e9)
+        topk = sim.topk(min(self.k, S-1), dim=-1)
+        return topk.indices, topk.values
+class MeshAttention(torch.nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.h  = cfg.n_heads
+        self.d  = cfg.d_model // cfg.n_heads
+        self.Wq = torch.nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.Wk = torch.nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.Wv = torch.nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.Wo = torch.nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+    def forward(self, x, edge_idx):
+        B, S, D = x.shape
+        Q = self.Wq(x).view(B,S,self.h,self.d).transpose(1,2)
+        K = self.Wk(x).view(B,S,self.h,self.d).transpose(1,2)
+        V = self.Wv(x).view(B,S,self.h,self.d).transpose(1,2)
+        attn = torch.matmul(Q, K.transpose(-2,-1)) / math.sqrt(self.d)
+        mask = torch.full((B,self.h,S,S), -1e9, device=x.device)
+        idx  = edge_idx.unsqueeze(1).expand(B,self.h,S,-1)
+        src  = torch.arange(S,device=x.device).view(1,1,S,1).expand_as(idx)
+        mask.scatter_(3, idx, attn.gather(3, idx))
+        attn = F.softmax(mask, dim=-1)
+        out  = torch.matmul(attn, V).transpose(1,2).reshape(B,S,D)
+        return self.Wo(out), attn.mean(1)
+class ExitGate(torch.nn.Module):
+    def __init__(self, d): super().__init__(); self.g = torch.nn.Linear(d,1)
+    def forward(self, x): return torch.sigmoid(self.g(x)).squeeze(-1)
+class TMTLayer(torch.nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.attn  = MeshAttention(cfg)
+        self.ff    = torch.nn.Sequential(
+            torch.nn.Linear(cfg.d_model, cfg.d_model*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(cfg.d_model*2, cfg.d_model),
+        )
+        self.gate  = ExitGate(cfg.d_model)
+        self.ln1   = torch.nn.LayerNorm(cfg.d_model)
+        self.ln2   = torch.nn.LayerNorm(cfg.d_model)
+    def forward(self, x, edge_idx, frozen):
+        a, attn_w = self.attn(self.ln1(x), edge_idx)
+        x = x + a
+        x = x + self.ff(self.ln2(x))
+        conf = self.gate(x)
+        return x, conf, attn_w
+class TMTModel(torch.nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg    = cfg
+        self.emb    = torch.nn.Embedding(cfg.vocab_size, cfg.d_model)
+        self.mesh   = MeshBuilder(cfg.graph_k)
+        self.layers = torch.nn.ModuleList([TMTLayer(cfg) for _ in range(cfg.n_layers)])
+        self.ln     = torch.nn.LayerNorm(cfg.d_model)
+        self.head   = torch.nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+    def forward(self, ids):
+        x = self.emb(ids)
+        B, S, D = x.shape
+        frozen  = torch.zeros(B, S, dtype=torch.bool)
+        exits   = []
+        confs   = []
+        attns   = []
+        edge_idx, _ = self.mesh(x)
+        for layer in self.layers:
+            x_new, conf, attn_w = layer(x, edge_idx, frozen)
+            new_exits = (~frozen) & (conf > self.cfg.exit_threshold)
+            frozen    = frozen | new_exits
+            x         = torch.where(frozen.unsqueeze(-1), x, x_new)
+            exits.append(new_exits.float())
+            confs.append(conf)
+            attns.append(attn_w)
+            edge_idx, _ = self.mesh(x)
+        logits = self.head(self.ln(x))
+        return logits, exits, confs, attns
+# Instantiate once at startup
+# The seed is set before construction so the randomly initialised weights are the same
+# on every start. No checkpoint is loaded anywhere in the visible code, so MODEL runs
+# with those initial weights.
+torch.manual_seed(42)
+CFG   = TMTConfig()
+MODEL = TMTModel(CFG)
+# Inference only: the demo never trains the model.
+MODEL.eval()
+# Sentences offered as examples in the UI and used as a fallback for empty input.
+SAMPLE_SENTENCES = [
+    "The neural network learned to represent complex patterns in the data",
+    "Attention mechanisms allow transformers to focus on relevant tokens",
+    "Dynamic graph topology adapts to the semantic content of the sequence",
+    "Machine learning models require large amounts of training data",
+    "The quick brown fox jumps over the lazy dog near the river",
+    "Adaptive depth routing reduces compute by 50 percent on average",
+    "Language models predict the next word given the previous context",
+    "Graph neural networks operate over structured relational data",
+]
+# Lower-case word -> category index into TYPE_COLORS / TYPE_LABELS. Used only to colour
+# the plots. Lookups are exact-match and the call sites fall back to category 1 for
+# unknown words, so inflected forms that are not listed (e.g. "models", "networks")
+# take the fallback colour.
+WORD_TYPES = {
+    "the":0,"a":0,"an":0,"of":0,"in":0,"to":0,"and":0,"is":0,"are":0,"by":0,
+    "on":0,"at":0,"for":0,"with":0,"this":0,"that":0,"it":0,"its":0,
+    "learned":1,"focus":1,"allow":1,"predict":1,"require":1,"adapts":1,
+    "reduces":1,"operate":1,"jumps":1,"represent":1,
+    "neural":2,"network":2,"attention":2,"transformer":2,"semantic":2,
+    "topology":2,"graph":2,"compute":2,"language":2,"model":2,
+    "mechanisms":3,"dynamic":3,"adaptive":3,"structured":3,"relational":3,
+    "patterns":3,"complex":3,"relevant":3,"previous":3,
+}
+# Parallel lists indexed by the category numbers above.
+TYPE_COLORS = ["#22c55e","#3b82f6","#f59e0b","#ef4444"]
+TYPE_LABELS = ["Function words","Common verbs","Domain terms","Complex"]
+def encode(text):
+    words = text.lower().split()[:CFG.max_seq_len]
+    ids   = [hash(w) % (CFG.vocab_size-2) + 1 for w in words]
+    return words, torch.tensor([ids])
+def run_model(text):
+    words, ids = encode(text)
+    with torch.no_grad():
+        logits, exits, confs, attns = MODEL(ids)
+    return words, exits, confs, attns
+# ── FIGURE 1: Exit gate heatmap ─────────────────────────────────────────────
+def plot_exit_heatmap(words, exits, confs):
+    # Render a two-panel figure and return it as a PIL image: left, a layer-by-token
+    # heatmap of the exit flags; right, the mean gate confidence per layer against the
+    # exit threshold.
+    #   words: token strings (x-axis labels of the heatmap).
+    #   exits, confs: per-layer lists as returned by run_model.
+    S   = len(words)
+    N   = len(exits)
+    # Stack the per-layer tensors and drop axis 1 when it has size 1 (the batch axis;
+    # encode() always builds a batch of one).
+    mat = torch.stack(exits, dim=0).squeeze(1).numpy()  # (N, S)
+    con = torch.stack(confs, dim=0).squeeze(1).numpy()
+    # NOTE(review): the figure height grows with the token count although tokens run
+    # along the x-axis; confirm this is intended.
+    fig, axes = plt.subplots(1, 2, figsize=(14, max(3, S*0.35+1.5)))
+    fig.patch.set_facecolor('#0f172a')
+    # Exit heatmap
+    ax = axes[0]
+    ax.set_facecolor('#1e293b')
+    im = ax.imshow(mat, aspect='auto', cmap='RdYlGn', vmin=0, vmax=1,
+                   interpolation='nearest')
+    ax.set_yticks(range(N)); ax.set_yticklabels([f"L{i+1}" for i in range(N)],
+                                                 color='white', fontsize=9)
+    # Tick labels are cut to 8 characters to limit overlap.
+    ax.set_xticks(range(S)); ax.set_xticklabels(
+        [w[:8] for w in words], rotation=45, ha='right', color='white', fontsize=8)
+    ax.set_title("Exit Gate — Green = token froze at this layer",
+                 color='white', fontsize=11, pad=8)
+    plt.colorbar(im, ax=ax, fraction=0.03)
+    # Confidence line chart
+    ax2 = axes[1]
+    ax2.set_facecolor('#1e293b')
+    # Average over the token axis: one value per layer.
+    avg_conf = con.mean(axis=1)
+    layers   = range(1, N+1)
+    ax2.plot(layers, avg_conf, 'o-', color='#60a5fa', lw=2.5, ms=7)
+    ax2.fill_between(layers, avg_conf, alpha=0.2, color='#60a5fa')
+    ax2.axhline(CFG.exit_threshold, color='#f59e0b', lw=1.5, ls='--',
+                label=f'Exit threshold ({CFG.exit_threshold})')
+    ax2.set_xlabel("Layer", color='white', fontsize=10)
+    ax2.set_ylabel("Avg Gate Confidence", color='white', fontsize=10)
+    ax2.set_title("Confidence per Layer", color='white', fontsize=11)
+    ax2.tick_params(colors='white'); ax2.legend(fontsize=9)
+    ax2.set_facecolor('#1e293b')
+    for spine in ax2.spines.values(): spine.set_color('#334155')
+    plt.tight_layout()
+    # Serialise to an in-memory PNG, reopen it with PIL, then release the figure.
+    buf = BytesIO(); fig.savefig(buf, format='png', dpi=130, bbox_inches='tight',
+                                  facecolor='#0f172a'); buf.seek(0)
+    img = Image.open(buf); plt.close(fig)
+    return img
+# ── FIGURE 2: Dynamic attention graph ───────────────────────────────────────
+def plot_attention_graph(words, attns):
+    # Draw the attention graph of the first, middle and last layer side by side, with
+    # tokens on a circle, and return the figure as a PIL image.
+    #   words: token strings (node labels).
+    #   attns: per-layer attention weights as returned by run_model.
+    S = len(words)
+    k = CFG.graph_k
+    # NOTE(review): nothing below draws random numbers, so this call only resets
+    # NumPy's global RNG state; confirm it is still needed.
+    np.random.seed(42)
+    # Circular layout
+    angles = np.linspace(0, 2*np.pi, S, endpoint=False)
+    pos    = np.stack([np.cos(angles), np.sin(angles)], axis=1)
+    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
+    fig.patch.set_facecolor('#0f172a')
+    layers_to_show = [0, len(attns)//2, -1]
+    titles = ["Layer 1 — Initial Graph", f"Layer {len(attns)//2+1} — Mid", f"Layer {len(attns)} — Final"]
+    for col, (li, title) in enumerate(zip(layers_to_show, titles)):
+        ax = axes[col]
+        ax.set_facecolor('#1e293b')
+        # Drop the leading batch axis (size 1 for the single-sentence input).
+        attn_w = attns[li].squeeze(0).detach().numpy()  # (S, S)
+        # Draw edges
+        for i in range(S):
+            # The k largest weights in row i, in descending order. When a row has fewer
+            # than k non-zero weights, zero-weight targets are drawn as faint edges too.
+            top_k = np.argsort(attn_w[i])[::-1][:k]
+            for j in top_k:
+                w = attn_w[i,j]
+                # Opacity and line width scale with the attention weight.
+                ax.plot([pos[i,0], pos[j,0]], [pos[i,1], pos[j,1]],
+                        color='#3b82f6', alpha=min(0.9, w*3+0.1), lw=w*3+0.3)
+        # Draw nodes
+        for i, word in enumerate(words):
+            # Words missing from WORD_TYPES fall back to category 1.
+            wtype = WORD_TYPES.get(word.lower(), 1)
+            col_node = TYPE_COLORS[wtype]
+            ax.scatter(pos[i,0], pos[i,1], c=col_node, s=200, zorder=5,
+                       edgecolors='white', linewidths=1)
+            # Labels sit just outside the circle and are cut to 7 characters.
+            ax.text(pos[i,0]*1.22, pos[i,1]*1.22, word[:7],
+                    ha='center', va='center', fontsize=7.5, color='white')
+        ax.set_xlim(-1.5, 1.5); ax.set_ylim(-1.5, 1.5)
+        ax.set_title(title, color='white', fontsize=10, pad=6)
+        ax.axis('off')
+    # Legend
+    legend_patches = [mpatches.Patch(color=TYPE_COLORS[i], label=TYPE_LABELS[i])
+                      for i in range(4)]
+    fig.legend(handles=legend_patches, loc='lower center', ncol=4,
+               fontsize=9, facecolor='#1e293b', labelcolor='white',
+               edgecolor='#334155', bbox_to_anchor=(0.5, -0.02))
+    plt.tight_layout()
+    # Serialise to an in-memory PNG, reopen it with PIL, then release the figure.
+    buf = BytesIO(); fig.savefig(buf, format='png', dpi=130, bbox_inches='tight',
+                                  facecolor='#0f172a'); buf.seek(0)
+    img = Image.open(buf); plt.close(fig)
+    return img
+# ── FIGURE 3: Token compute depth ───────────────────────────────────────────
+def plot_token_depth(words, exits, confs):
+    S    = len(words)
+    N    = len(exits)
+    exit_mat = torch.stack(exits, dim=0).squeeze(1).numpy()
+    exit_layer = []
+    for i in range(S):
+        col = exit_mat[:, i]
+        first = np.argmax(col) + 1 if col.max() > 0 else N
+        exit_layer.append(int(first))
+    fig, ax = plt.subplots(figsize=(max(8, S*0.7), 4.5))
+    fig.patch.set_facecolor('#0f172a')
+    ax.set_facecolor('#1e293b')
+    colors = [TYPE_COLORS[WORD_TYPES.get(w.lower(), 1)] for w in words]
+    bars   = ax.bar(range(S), exit_layer, color=colors, alpha=0.9,
+                    edgecolor='white', linewidth=0.6)
+    ax.axhline(N, color='#94a3b8', lw=1.5, ls='--', label=f'Max depth ({N} layers)')
+    ax.axhline(np.mean(exit_layer), color='#f59e0b', lw=2, ls='-.',
+               label=f'Avg depth ({np.mean(exit_layer):.1f} layers = '
+                     f'{np.mean(exit_layer)/N*100:.0f}% compute)')
+    for bar, val in zip(bars, exit_layer):
+        ax.text(bar.get_x()+bar.get_width()/2, val+0.05, str(val),
+                ha='center', va='bottom', fontsize=9, color='white', fontweight='bold')
+    ax.set_xticks(range(S))
+    ax.set_xticklabels(words, rotation=40, ha='right', color='white', fontsize=9)
+    ax.set_ylabel("Layers used", color='white', fontsize=11)
+    ax.set_ylim(0, N+1.5)
+    ax.set_title("Adaptive Depth — Compute per Token\n"
+                 "Simple tokens exit early · Complex tokens go deep",
+                 color='white', fontsize=12)
+    ax.tick_params(colors='white')
+    for spine in ax.spines.values(): spine.set_color('#334155')
+    legend_patches = [mpatches.Patch(color=TYPE_COLORS[i], label=TYPE_LABELS[i])
+                      for i in range(4)]
+    legend_patches.append(
+        mpatches.Patch(color='#f59e0b', label=f'Avg: {np.mean(exit_layer):.1f}L'))
+    ax.legend(handles=legend_patches, fontsize=9, facecolor='#1e293b',
+              labelcolor='white', edgecolor='#334155', ncol=3)
+    plt.tight_layout()
+    buf = BytesIO(); fig.savefig(buf, format='png', dpi=130, bbox_inches='tight',
+                                  facecolor='#0f172a'); buf.seek(0)
+    img = Image.open(buf); plt.close(fig)
+    return img
+# ── Stats text ───────────────────────────────────────────────────────────────
+def compute_stats(words, exits, confs):
+    S = len(words); N = len(exits)
+    exit_mat = torch.stack(exits, dim=0).squeeze(1).numpy()
+    exit_layers = []
+    for i in range(S):
+        col = exit_mat[:, i]
+        exit_layers.append(int(np.argmax(col)+1) if col.max()>0 else N)
+    avg_depth   = np.mean(exit_layers)
+    compute_pct = avg_depth / N * 100
+    earliest    = words[int(np.argmin(exit_layers))]
+    deepest     = words[int(np.argmax(exit_layers))]
+    total_saved = sum(N - e for e in exit_layers)
+    stats = f"""
+## Analysis Results
+| Metric | Value |
+|:---|:---|
+| Tokens analysed | {S} |
+| Total layers | {N} |
+| Avg depth used | {avg_depth:.1f} / {N} layers |
+| **Compute used** | **{compute_pct:.0f}% of full depth** |
+| **Compute saved** | **{100-compute_pct:.0f}%** |
+| Layer calls saved | {total_saved} of {S*N} total |
+| Earliest exit token | `{earliest}` (layer {min(exit_layers)}) |
+| Deepest token | `{deepest}` (layer {max(exit_layers)}) |
+**Graph:** Each token connects to {CFG.graph_k} nearest neighbours by cosine similarity.
+The graph rebuilds after every layer as token representations evolve.
+**Paper:** [10.5281/zenodo.20287390](https://doi.org/10.5281/zenodo.20287390)
+**Model:** [vigneshwar234/TemporalMesh-Transformer](https://huggingface.co/vigneshwar234/TemporalMesh-Transformer)
+**Code:** [github.com/vignesh2027/TemporalMesh-Transformer](https://github.com/vignesh2027/TemporalMesh-Transformer)
+"""
+    return stats
+# ── Main inference function ──────────────────────────────────────────────────
+def analyse(text):
+    text = text.strip()
+    if not text:
+        text = random.choice(SAMPLE_SENTENCES)
+    words, exits, confs, attns = run_model(text)
+    img1  = plot_exit_heatmap(words, exits, confs)
+    img2  = plot_attention_graph(words, attns)
+    img3  = plot_token_depth(words, exits, confs)
+    stats = compute_stats(words, exits, confs)
+    return img1, img2, img3, stats
+def random_example():
+    return random.choice(SAMPLE_SENTENCES)
+# ── Gradio UI ────────────────────────────────────────────────────────────────
+# Stylesheet passed to gr.Blocks: dark page background, light text, blue buttons, and
+# the footer element hidden.
+CSS = """
+.gradio-container { background: #0f172a !important; color: white !important; }
+h1, h2, h3, p, label { color: #e2e8f0 !important; }
+.gr-button { background: #2563eb !important; color: white !important; border: none !important; }
+.gr-button:hover { background: #1d4ed8 !important; }
+footer { display: none !important; }
+"""
+# Raw HTML banner rendered at the top of the page: title, tagline, and four link badges
+# (paper, model card, source code, benchmark dataset).
+HEADER = """
+<div style="text-align:center; padding: 20px 0 10px 0; background:#0f172a;">
+  <h1 style="font-size:2.2em; font-weight:800; color:#58a6ff; margin:0;">
+    TemporalMesh Transformer
+  </h1>
+  <p style="color:#8b949e; font-size:1.05em; margin:6px 0 0 0;">
+    Dynamic Graph Attention &nbsp;·&nbsp; Temporal Decay &nbsp;·&nbsp; Adaptive Depth Routing
+  </p>
+  <div style="margin-top:12px; display:flex; justify-content:center; gap:10px; flex-wrap:wrap;">
+    <a href="https://doi.org/10.5281/zenodo.20287390" target="_blank"
+       style="background:#1e3a5f;color:#58a6ff;padding:5px 14px;border-radius:20px;
+              text-decoration:none;font-size:0.88em;border:1px solid #2563eb;">
+      📄 Paper (Zenodo DOI)
+    </a>
+    <a href="https://huggingface.co/vigneshwar234/TemporalMesh-Transformer" target="_blank"
+       style="background:#1e3a5f;color:#fbbf24;padding:5px 14px;border-radius:20px;
+              text-decoration:none;font-size:0.88em;border:1px solid #f59e0b;">
+      🤗 Model Card
+    </a>
+    <a href="https://github.com/vignesh2027/TemporalMesh-Transformer" target="_blank"
+       style="background:#1e3a5f;color:#a78bfa;padding:5px 14px;border-radius:20px;
+              text-decoration:none;font-size:0.88em;border:1px solid #7c3aed;">
+      💻 GitHub Code
+    </a>
+    <a href="https://huggingface.co/datasets/vigneshwar234/TMT-Benchmarks" target="_blank"
+       style="background:#1e3a5f;color:#34d399;padding:5px 14px;border-radius:20px;
+              text-decoration:none;font-size:0.88em;border:1px solid #16a34a;">
+      📊 Benchmark Dataset
+    </a>
+  </div>
+</div>
+"""
+DESCRIPTION = """
+Enter any sentence to see **TMT's three core innovations in action**:
+- **Exit Gate Heatmap** — which tokens freeze at which layer (green = exited early)
+- **Dynamic Attention Graph** — how the kNN mesh evolves across layers as token meanings shift
+- **Token Compute Depth** — how many layers each word actually uses vs the full 12
+> TMT achieves **29.4 perplexity** on WikiText-2 at **~48% of standard compute**.
+> No prior architecture combines dynamic graph attention + temporal decay + per-token early exit.
+"""
+# Page layout: header, description, input row (textbox + two buttons), stats markdown,
+# three image panels, example sentences, footer.
+with gr.Blocks(css=CSS, title="TemporalMesh Transformer Demo") as demo:
+    gr.HTML(HEADER)
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        with gr.Column(scale=4):
+            # Pre-filled with the first sample so "Analyse" works without typing.
+            txt = gr.Textbox(
+                label="Input sentence",
+                placeholder="Enter any sentence…",
+                lines=2,
+                value=SAMPLE_SENTENCES[0],
+            )
+        with gr.Column(scale=1, min_width=140):
+            rnd_btn = gr.Button("🎲 Random", variant="secondary")
+            run_btn = gr.Button("▶ Analyse", variant="primary")
+    stats_out = gr.Markdown(label="Stats")
+    # img1 and img3 share a row; img2 (the three-panel graph) gets a full-width row.
+    with gr.Row():
+        img1 = gr.Image(label="Exit Gate Heatmap + Confidence", type="pil", height=320)
+        img3 = gr.Image(label="Token Compute Depth", type="pil", height=320)
+    img2 = gr.Image(label="Dynamic Attention Graph (3 stages)", type="pil", height=340)
+    gr.Examples(
+        examples=[[s] for s in SAMPLE_SENTENCES],
+        inputs=[txt],
+        label="Example sentences",
+    )
+    # The output order must match the tuple returned by analyse(): (img1, img2, img3, stats).
+    run_btn.click(fn=analyse, inputs=[txt], outputs=[img1, img2, img3, stats_out])
+    # "Random" only fills the textbox; it does not trigger an analysis.
+    rnd_btn.click(fn=random_example, outputs=[txt])
+    # Submitting the textbox runs the same analysis as the button.
+    txt.submit(fn=analyse, inputs=[txt], outputs=[img1, img2, img3, stats_out])
+    gr.HTML("""
+    <div style="text-align:center;padding:16px 0 8px;color:#64748b;font-size:0.85em;">
+      TemporalMesh Transformer · Vignesh, 2026 · MIT License ·
+      <a href="https://doi.org/10.5281/zenodo.20287390" style="color:#58a6ff;">
+        DOI: 10.5281/zenodo.20287390
+      </a>
+    </div>
+    """)
+# Starts the server at import time; there is no __main__ guard in this file.
+demo.launch()