RameshArvind
/

iterative-pagerank

Model card Files Files and versions

xet

Community

RameshArvind committed on Mar 14

Commit

eefc84e

verified ·

1 Parent(s): 1c29bf1

Upload iterative_pagerank.py with huggingface_hub

Browse files

Files changed (1) hide show

iterative_pagerank.py +691 -0

iterative_pagerank.py ADDED Viewed

	@@ -0,0 +1,691 @@

+"""
+Non-autoregressive iterative PageRank model.
+PageRank IS power iteration — the same linear operation applied repeatedly
+until convergence. This maps perfectly to shared-weight iterative transformers.
+Architecture:
+  - Input: N nodes, each gets its adjacency row (directed graph)
+  - Shared transformer body (bidirectional attention)
+  - Output: each node predicts its PageRank value (regression)
+  - Iterative refinement mirrors power iteration
+  - Train with K=16, eval with K=16..256+
+Difficulty knobs:
+  - Damping factor d: higher d → more structure-dependent → harder
+  - Graph type: random (easy) → power-law/hub (hard)
+  - Edge density: affects spectral gap → convergence rate
+Usage:
+    python scripts/iterative_pagerank.py --device cpu --steps 500 --batch 64 --n-nodes 16
+    python scripts/iterative_pagerank.py --device cuda --steps 50000 --batch 2048 --compile
+"""
+import argparse
+import math
+import time
+from contextlib import nullcontext
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
@dataclass
class PageRankConfig:
    """Hyperparameters for the iterative PageRank model and its training data.

    Raises:
        ValueError: if ``d_model`` is not a positive multiple of ``n_heads``,
            or if the resulting per-head width is odd. The rotary embedding
            rotates the two halves of every head against each other, so an
            odd width cannot be split evenly.
    """

    n_nodes: int = 64           # nodes per graph (also the adjacency-row width fed to the model)
    d_model: int = 128
    n_heads: int = 4
    n_layers: int = 4
    d_ff: int = 512
    dropout: float = 0.1
    train_iters: int = 16       # refinement iterations used when forward() gets no n_iters
    rope_base: float = 10.0
    damping: float = 0.85       # PageRank damping factor
    # Curriculum knots: (fraction_of_training, edge_prob, graph_type_weights)
    # with graph_type_weights = (random, preferential_attachment, hub_spoke).
    # Both knots are identical, so the hard mixed distribution is used from
    # the very first step (a "reverse" curriculum).
    curriculum: tuple = (
        (0.0,  0.08, (0.2, 0.5, 0.3)),
        (1.0,  0.08, (0.2, 0.5, 0.3)),
    )

    def __post_init__(self):
        # Fail fast: without these checks a bad width only surfaces later as a
        # reshape error, or silently produces attention heads of the wrong size.
        if self.n_heads <= 0 or self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be a positive multiple of "
                f"n_heads ({self.n_heads})"
            )
        head_dim = self.d_model // self.n_heads
        if head_dim % 2 != 0:
            raise ValueError(
                f"per-head width d_model // n_heads = {head_dim} must be even "
                "for rotary position embeddings"
            )
+# ---------------------------------------------------------------------------
+# RoPE
+# ---------------------------------------------------------------------------
def build_rope_cache(seq_len, head_dim, base=10.0, device="cpu"):
    """Precompute the rotary-embedding cosine and sine tables.

    Args:
        seq_len: number of positions to tabulate.
        head_dim: per-head width; one frequency is produced for every second
            channel index (0, 2, 4, ...).
        base: base of the geometric frequency progression.
        device: device on which the tables are created.

    Returns:
        ``(cos, sin)``, each with one row per position and one column per
        frequency.
    """
    exponents = torch.arange(0, head_dim, 2, device=device).float() / head_dim
    inv_freq = 1.0 / (base ** exponents)
    pos = torch.arange(seq_len, device=device).float()
    angles = torch.outer(pos, inv_freq)
    return angles.cos(), angles.sin()
def apply_rope(x, cos, sin):
    """Apply rotary position embeddings to ``x``.

    The last dimension of ``x`` is split into a lower and an upper half that
    are rotated against each other. ``x.shape[2]`` is used as the sequence
    length: ``cos`` and ``sin`` are truncated to that many rows and then
    broadcast against the two halves.
    """
    seq_len = x.shape[2]
    half = x.shape[-1] // 2
    lower = x[..., :half]
    upper = x[..., half:]
    c = cos[:seq_len]
    s = sin[:seq_len]
    rotated_lower = lower * c - upper * s
    rotated_upper = upper * c + lower * s
    return torch.cat((rotated_lower, rotated_upper), dim=-1)
+# ---------------------------------------------------------------------------
+# Transformer layers
+# ---------------------------------------------------------------------------
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with rotary position embeddings.

    No causal mask is applied, so every position can attend to every other
    position. An optional ``adj_bias`` is forwarded unchanged as the
    ``attn_mask`` of ``scaled_dot_product_attention``.

    Raises:
        ValueError: if ``d_model`` is not a positive multiple of ``n_heads``.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        # Reject widths that cannot be split evenly across heads here, rather
        # than letting the reshape in forward() fail with an opaque size error.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be a positive multiple of n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.wq = nn.Linear(d_model, d_model, bias=False)
        self.wk = nn.Linear(d_model, d_model, bias=False)
        self.wv = nn.Linear(d_model, d_model, bias=False)
        self.wo = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, t, B, N):
        """Reshape (B, N, D) into (B, n_heads, N, head_dim)."""
        return t.view(B, N, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, x, cos, sin, adj_bias=None):
        """Attend over the node axis.

        Args:
            x: (B, N, D) node features.
            cos, sin: rotary tables applied to queries and keys only.
            adj_bias: optional attention mask/bias passed to SDPA as-is.

        Returns:
            (B, N, D) attended features after the output projection.
        """
        B, N, D = x.shape
        q = self._split_heads(self.wq(x), B, N)
        k = self._split_heads(self.wk(x), B, N)
        v = self._split_heads(self.wv(x), B, N)
        q, k = apply_rope(q, cos, sin), apply_rope(k, cos, sin)
        attn = F.scaled_dot_product_attention(
            q, k, v, attn_mask=adj_bias,
            # Attention dropout is only active in training mode.
            dropout_p=self.dropout.p if self.training else 0.0,
        )
        return self.wo(attn.transpose(1, 2).contiguous().view(B, N, D))
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer.

    RMSNorm is applied before the attention sub-layer and before the ReLU
    feed-forward sub-layer; each sub-layer's output is added back to its input.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.norm1 = nn.RMSNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm2 = nn.RMSNorm(d_model)
        expand = nn.Linear(d_model, d_ff, bias=False)
        contract = nn.Linear(d_ff, d_model, bias=False)
        self.ff = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(dropout))

    def forward(self, x, cos, sin, adj_bias=None):
        """Run the attention and feed-forward sub-layers with residual connections."""
        attended = self.attn(self.norm1(x), cos, sin, adj_bias)
        x = x + attended
        transformed = self.ff(self.norm2(x))
        return x + transformed
+# ---------------------------------------------------------------------------
+# PageRank Model
+# ---------------------------------------------------------------------------
class IterativePageRankModel(nn.Module):
    """Transformer that refines a per-node PageRank estimate over several passes.

    The same stack of layers is reused on every pass. Each pass sees the
    projected adjacency rows plus a projection of the previous estimate and
    emits one score per node; a softmax over nodes turns the scores into a
    distribution.
    """

    def __init__(self, config: PageRankConfig):
        super().__init__()
        self.config = config
        width = config.d_model
        n_nodes = config.n_nodes
        # Each node is described by its adjacency row (n_nodes values).
        self.input_proj = nn.Linear(n_nodes, width, bias=False)
        # Feedback path: the previous scalar estimate of each node.
        self.pred_proj = nn.Linear(1, width, bias=False)
        # Layer stack shared by all refinement passes.
        blocks = [
            TransformerBlock(width, config.n_heads, config.d_ff, config.dropout)
            for _ in range(config.n_layers)
        ]
        self.layers = nn.ModuleList(blocks)
        self.final_norm = nn.RMSNorm(width)
        # One unnormalised score per node.
        self.output_head = nn.Linear(width, 1, bias=False)
        rope_cos, rope_sin = build_rope_cache(n_nodes, width // config.n_heads, config.rope_base)
        self.register_buffer("rope_cos", rope_cos)
        self.register_buffer("rope_sin", rope_sin)

    def _transformer_step(self, h_input, cos, sin, adj_bias):
        """Run the layer stack once and return per-node scores of shape (B, N, 1)."""
        hidden = h_input
        for block in self.layers:
            hidden = block(hidden, cos, sin, adj_bias)
        return self.output_head(self.final_norm(hidden))

    def forward(self, adj, n_iters=None):
        """Iteratively predict PageRank distributions.

        Args:
            adj: (B, N, N) directed adjacency matrix.
            n_iters: number of refinement passes; ``config.train_iters`` when None.

        Returns:
            A list with one (B, N) prediction per pass, in pass order.
        """
        passes = self.config.train_iters if n_iters is None else n_iters
        batch, n_nodes, _ = adj.shape

        # Additive attention bias: +2.0 wherever an edge exists, broadcast over heads.
        attn_bias = (adj * 2.0).unsqueeze(1)  # (B, 1, N, N)
        graph_features = self.input_proj(adj)  # (B, N, d)

        # Start from the uniform distribution.
        estimate = torch.full((batch, n_nodes, 1), 1.0 / n_nodes, device=adj.device)
        history = []
        for _ in range(passes):
            scores = self._transformer_step(
                graph_features + self.pred_proj(estimate),
                self.rope_cos,
                self.rope_sin,
                attn_bias,
            )
            distribution = F.softmax(scores.squeeze(-1), dim=-1)  # (B, N)
            history.append(distribution)
            # Detached: gradients do not flow from one pass into the previous one.
            estimate = distribution.unsqueeze(-1).detach()
        return history
+# ---------------------------------------------------------------------------
+# Ground truth PageRank (power iteration)
+# ---------------------------------------------------------------------------
def compute_pagerank(adj: torch.Tensor, damping: float = 0.85, n_iters: int = 30, tol: float = 1e-8):
    """Compute PageRank for a batch of directed graphs via power iteration.

    Args:
        adj: (B, N, N) adjacency, ``adj[b, i, j] = 1`` meaning an edge i -> j.
        damping: probability of following an out-edge instead of teleporting.
        n_iters: maximum number of power-iteration steps.
        tol: stop early once the largest per-node change across the whole
            batch falls below this value.

    Returns:
        (B, N) tensor whose rows sum to 1. With ``n_iters == 0`` the uniform
        distribution is returned.

    NOTE(review): the iteration simply stops after ``n_iters`` steps, so with
    the defaults the result is not guaranteed to have reached ``tol``.
    """
    B, N, _ = adj.shape
    device = adj.device

    # Graph-dependent quantities are constant across iterations; compute once.
    out_deg = adj.sum(dim=-1, keepdim=True)           # (B, N, 1)
    is_dangling = (out_deg.squeeze(-1) == 0).float()  # (B, N), 1 where no out-edges
    # Column-stochastic transition matrix: M[j, i] = adj[i, j] / out_deg[i].
    # The clamp only avoids 0/0 for dangling nodes, whose columns stay zero.
    M = (adj / out_deg.clamp(min=1)).transpose(1, 2)  # (B, N, N)

    pr = torch.full((B, N), 1.0 / N, device=device)
    teleport = (1 - damping) / N
    for _ in range(n_iters):
        prev = pr
        pr = teleport + damping * (M @ prev.unsqueeze(-1)).squeeze(-1)
        # Dangling nodes have no out-edges: spread their mass uniformly.
        dangling_mass = (prev * is_dangling).sum(dim=-1, keepdim=True)
        pr = pr + damping * dangling_mass / N
        # Renormalise to counter floating-point drift.
        pr = pr / pr.sum(dim=-1, keepdim=True)
        # `pr` already holds the newest iterate, so breaking here returns it
        # rather than the previous (stale) one.
        if (pr - prev).abs().max() < tol:
            break
    return pr
+# ---------------------------------------------------------------------------
+# Graph generation
+# ---------------------------------------------------------------------------
def generate_random_graph(batch_size, n_nodes, edge_prob, device):
    """Sample directed Erdos-Renyi graphs.

    Every ordered pair of nodes gets an edge independently with probability
    ``edge_prob``; self-loops are removed afterwards.

    Returns:
        (batch_size, n_nodes, n_nodes) float tensor of 0/1 entries on ``device``.
    """
    draws = torch.rand(batch_size, n_nodes, n_nodes, device=device)
    adj = (draws < edge_prob).float()
    # The diagonal is a view into `adj`, so zeroing it clears self-loops in place.
    torch.diagonal(adj, dim1=-2, dim2=-1).zero_()
    return adj
def generate_preferential_attachment(batch_size, n_nodes, n_edges_per_node, device):
    """Grow directed graphs by degree-proportional (Barabasi-Albert style) attachment.

    The first ``m + 1`` nodes (``m = max(1, n_edges_per_node)``) are seeded
    with forward edges i -> j for every i < j. Each later node then draws
    ``m`` targets among the existing nodes with probability proportional to
    (in-degree + out-degree + 1); each draw adds an edge new -> target and,
    with probability 0.5, the reverse edge as well. Draws are independent, so
    the same target can be picked more than once.

    The graphs are built on CPU and moved to ``device`` at the end.
    """
    m = max(1, n_edges_per_node)
    seed_size = min(m + 1, n_nodes)

    adj = torch.zeros(batch_size, n_nodes, n_nodes)
    # Strict upper triangle == edges i -> j for all i < j inside the seed.
    adj[:, :seed_size, :seed_size] = torch.triu(torch.ones(seed_size, seed_size), diagonal=1)

    rows = torch.arange(batch_size)
    for node in range(m + 1, n_nodes):
        existing = adj[:, :node, :node]
        # +1 keeps every existing node reachable, even with degree 0.
        weight = existing.sum(dim=-1) + existing.sum(dim=-2) + 1
        probs = weight / weight.sum(dim=-1, keepdim=True)
        for _ in range(m):
            picked = torch.multinomial(probs, 1).squeeze(-1)
            adj[rows, node, picked] = 1
            also_reverse = torch.rand(batch_size) < 0.5
            adj[rows[also_reverse], picked[also_reverse], node] = 1
    return adj.to(device)
def generate_hub_spoke(batch_size, n_nodes, n_hubs, device):
    """Sample hub-and-spoke directed graphs.

    For every graph ``n_hubs`` hub indices are drawn uniformly (repeats are
    possible). Each node links to each hub with probability 0.6, and a linked
    hub links back with probability 0.3. Each pair of hub slots is connected
    in both directions with probability 0.8. Finally 2% random edges are
    overlaid and self-loops are removed.

    The graphs are built on CPU and moved to ``device`` at the end.
    """
    adj = torch.zeros(batch_size, n_nodes, n_nodes)
    hubs = torch.randint(0, n_nodes, (batch_size, n_hubs))

    # Spoke -> hub edges, with an occasional hub -> spoke edge back.
    for node in range(n_nodes):
        for slot in range(n_hubs):
            hub = hubs[:, slot]
            linked = torch.rand(batch_size) < 0.6
            linked_rows = linked.nonzero(as_tuple=True)[0]
            adj[linked_rows, node, hub[linked_rows]] = 1
            returns = linked & (torch.rand(batch_size) < 0.3)
            return_rows = returns.nonzero(as_tuple=True)[0]
            adj[return_rows, hub[return_rows], node] = 1

    # Hub <-> hub edges.
    for first in range(n_hubs):
        for second in range(first + 1, n_hubs):
            joined = (torch.rand(batch_size) < 0.8).nonzero(as_tuple=True)[0]
            hub_a = hubs[joined, first]
            hub_b = hubs[joined, second]
            adj[joined, hub_a, hub_b] = 1
            adj[joined, hub_b, hub_a] = 1

    # Sparse background noise; clamp keeps entries in {0, 1}.
    background = (torch.rand(batch_size, n_nodes, n_nodes) < 0.02).float()
    adj = torch.clamp(adj + background, max=1)
    torch.diagonal(adj, dim1=-2, dim2=-1).zero_()
    return adj.to(device)
def generate_batch(batch_size, config, edge_prob, graph_weights, device):
    """Generate a shuffled, mixed batch of graphs with PageRank targets.

    Args:
        batch_size: number of graphs to produce (must be positive).
        config: provides ``n_nodes`` and ``damping``.
        edge_prob: edge probability for random graphs; also scales the number
            of edges per node for preferential-attachment graphs.
        graph_weights: ``(random, preferential_attachment, hub_spoke)``
            fractions of the batch.
        device: device for the returned tensors.

    Returns:
        ``(adj, targets, metadata)``: adjacency (B, N, N), PageRank targets
        (B, N), and a dict with ``edge_prob``, mean ``entropy``, mean
        ``max_pr`` and mean ``gini`` of the targets.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    N = config.n_nodes
    w_random, w_pref, w_hub = graph_weights

    # Floor the first two shares; rounding leftovers go to the hub bucket.
    n_random = int(batch_size * w_random)
    n_pref = int(batch_size * w_pref)
    n_hub = batch_size - n_random - n_pref
    # When hub graphs are disabled, leftovers must not create hub graphs:
    # hand them to a graph type that was actually requested.
    if w_hub <= 0 and n_hub > 0:
        if w_pref > 0:
            n_pref, n_hub = n_pref + n_hub, 0
        elif w_random > 0:
            n_random, n_hub = n_random + n_hub, 0

    adjs = []
    if n_random > 0:
        adjs.append(generate_random_graph(n_random, N, edge_prob, device))
    if n_pref > 0:
        n_edges = max(1, int(edge_prob * N * 0.5))
        adjs.append(generate_preferential_attachment(n_pref, N, n_edges, device))
    if n_hub > 0:
        n_hubs = max(2, N // 16)
        adjs.append(generate_hub_spoke(n_hub, N, n_hubs, device))
    adj = torch.cat(adjs, dim=0)

    # Shuffle so graph types are interleaved within the batch.
    perm = torch.randperm(batch_size, device=device)
    adj = adj[perm]

    # Ground-truth PageRank via power iteration.
    targets = compute_pagerank(adj, damping=config.damping)

    # Summary statistics of the target distributions.
    with torch.no_grad():
        entropy = -(targets * (targets + 1e-10).log()).sum(dim=-1).mean().item()
        max_pr = targets.max(dim=-1).values.mean().item()
        gini = _gini(targets)
    metadata = {
        "edge_prob": edge_prob,
        "entropy": entropy,
        "max_pr": max_pr,
        "gini": gini,
    }
    return adj, targets, metadata
+def _gini(pr):
+    """Gini coefficient of PageRank distribution. 0=uniform, 1=concentrated."""
+    sorted_pr, _ = pr.sort(dim=-1)
+    N = pr.shape[-1]
+    index = torch.arange(1, N + 1, device=pr.device).float()
+    return (2 * (index * sorted_pr).sum(dim=-1) / (N * sorted_pr.sum(dim=-1)) - (N + 1) / N).mean().item()
def get_curriculum_params(step, total_steps, curriculum):
    """Return ``(edge_prob, graph_weights)`` for the current training step.

    ``curriculum`` is a sequence of ``(fraction, edge_prob, weights)`` knots.
    The training fraction ``step / total_steps`` is located between two
    consecutive knots and both values are linearly interpolated. If no
    segment contains the fraction, the last knot's values are returned.
    """
    progress = step / max(1, total_steps)
    for (f0, p0, w0), (f1, p1, w1) in zip(curriculum, curriculum[1:]):
        if not (f0 <= progress <= f1):
            continue
        if f1 == f0:
            # Zero-width segment: nothing to interpolate.
            return p0, w0
        t = (progress - f0) / (f1 - f0)
        blended = tuple(a + t * (b - a) for a, b in zip(w0, w1))
        return p0 + t * (p1 - p0), blended
    last = curriculum[-1]
    return last[1], last[2]
+# ---------------------------------------------------------------------------
+# Training
+# ---------------------------------------------------------------------------
def train(config, args):
    """Train an IterativePageRankModel on a pre-generated pool of graphs.

    Args:
        config: PageRankConfig with the model and data hyperparameters.
        args: parsed command-line namespace. Reads ``device``, ``lr``,
            ``warmup``, ``steps``, ``batch``, ``compile``, ``log_interval``,
            ``eval_interval``, ``eval_batch`` and ``save_path`` (and passes
            the namespace on to ``save_checkpoint``).

    Returns:
        The trained model.
    """
    device = args.device
    # Compare the device *type* so indexed names such as "cuda:1" also enable
    # the CUDA-only paths (TF32, torch.compile, autocast).
    on_cuda = torch.device(device).type == "cuda"
    if on_cuda:
        torch.set_float32_matmul_precision('high')
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    model = IterativePageRankModel(config).to(device)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"Model params: {n_params:,} ({n_params/1e6:.2f}M)")
    print(f"Config: {config.n_layers}L, d={config.d_model}, h={config.n_heads}, "
          f"ff={config.d_ff}, iters={config.train_iters}, N={config.n_nodes}")
    print(f"Damping: {config.damping}")
    print(f"Device: {device}")
    print()

    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.95), weight_decay=0.01)

    def lr_schedule(step):
        """Linear warmup, then cosine decay from 1.0 down to 0.01 of the base LR."""
        if step < args.warmup:
            return step / args.warmup
        progress = (step - args.warmup) / max(1, args.steps - args.warmup)
        return 0.01 + 0.99 * 0.5 * (1 + math.cos(math.pi * progress))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_schedule)

    if args.compile and on_cuda:
        print("Compiling transformer step...")
        # Only the per-iteration body is compiled; the Python refinement loop stays eager.
        model._transformer_step = torch.compile(model._transformer_step)
        print("Compile done.")

    use_amp = on_cuda
    scaler = torch.amp.GradScaler('cuda', enabled=use_amp)
    autocast_ctx = torch.amp.autocast('cuda', dtype=torch.bfloat16) if use_amp else nullcontext()

    # Pre-generate a graph pool for fast sampling during training. The pool is
    # capped at 100K graphs and never empty, so `--steps 0` still works.
    pool_size = max(1, min(args.steps * args.batch, 100_000))
    print(f"Pre-generating {pool_size:,} graphs...")
    edge_prob, graph_weights = get_curriculum_params(0, args.steps, config.curriculum)
    pool_adj, pool_targets, pool_meta = generate_batch(pool_size, config, edge_prob, graph_weights, device)
    print(f"Pool ready. Gini={pool_meta['gini']:.2f}, max_pr={pool_meta['max_pr']:.4f}")

    t0 = time.time()
    for step in range(args.steps + 1):
        model.train()

        # Sample a batch (with replacement) from the pre-generated pool.
        idx = torch.randint(0, pool_size, (args.batch,), device=device)
        adj = pool_adj[idx]
        targets = pool_targets[idx]
        meta = pool_meta

        with autocast_ctx:
            all_prs = model(adj)
            # Intermediate supervision: KL(target || prediction) at every
            # refinement iteration, averaged over iterations.
            loss = 0.0
            for pr_pred in all_prs:
                loss += F.kl_div(
                    (pr_pred + 1e-10).log(),
                    targets,
                    reduction='batchmean',
                )
            loss /= len(all_prs)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        if step % args.log_interval == 0:
            elapsed = time.time() - t0
            with torch.no_grad():
                final_pr = all_prs[-1]
                mse = ((final_pr - targets) ** 2).mean().item()
                # Sampled pairwise ordering agreement.
                rank_acc = _ranking_accuracy(final_pr, targets)
                # Overlap of predicted and true top-5 nodes.
                top5_acc = _topk_accuracy(final_pr, targets, k=5)
            print(f"Step {step:5d} | KL: {loss.item():.4f} | MSE: {mse:.6f} | "
                  f"Rank: {rank_acc:.1%} | Top5: {top5_acc:.1%} | "
                  f"Gini: {meta['gini']:.2f} | {elapsed:.1f}s")

        if step > 0 and step % args.eval_interval == 0:
            evaluate(model, config, device, args.eval_batch)

    print("\n" + "=" * 70)
    print("FINAL EVALUATION")
    print("=" * 70)
    evaluate(model, config, device, args.eval_batch, verbose=True)

    if args.save_path:
        save_checkpoint(model, config, args)
    return model
+def _ranking_accuracy(pred_pr, true_pr):
+    """Fraction of pairwise orderings that match."""
+    pred_rank = pred_pr.argsort(dim=-1, descending=True).argsort(dim=-1)
+    true_rank = true_pr.argsort(dim=-1, descending=True).argsort(dim=-1)
+    # Pairwise concordance (simplified Kendall tau)
+    B, N = pred_pr.shape
+    correct = 0
+    total = 0
+    # Sample pairs for efficiency
+    n_pairs = min(100, N * (N - 1) // 2)
+    for _ in range(n_pairs):
+        i, j = torch.randint(0, N, (2,))
+        if i == j:
+            continue
+        pred_order = pred_pr[:, i] > pred_pr[:, j]
+        true_order = true_pr[:, i] > true_pr[:, j]
+        correct += (pred_order == true_order).float().sum().item()
+        total += B
+    return correct / max(1, total)
+def _topk_accuracy(pred_pr, true_pr, k=5):
+    """Fraction of true top-k nodes that appear in predicted top-k."""
+    pred_topk = pred_pr.topk(k, dim=-1).indices  # (B, k)
+    true_topk = true_pr.topk(k, dim=-1).indices  # (B, k)
+    # Check overlap
+    hits = 0
+    for i in range(k):
+        hits += (pred_topk == true_topk[:, i:i+1]).any(dim=-1).float().sum().item()
+    return hits / (pred_pr.shape[0] * k)
def evaluate(model, config, device, eval_batch=1024, verbose=False):
    """Print evaluation metrics across graph types and iteration counts.

    For each graph family a fresh batch is generated and the model is run
    once for the largest iteration count. The model returns the prediction of
    every iteration, so the metrics for the smaller counts are read from that
    single run instead of re-running the model from scratch for each count.

    Args:
        model: model to evaluate; it is switched to eval mode and left there.
        config: provides ``train_iters`` and the data settings.
        device: device on which the evaluation batches are created.
        eval_batch: number of graphs per graph family.
        verbose: additionally print top-node predictions for a few hub-spoke graphs.
    """
    model.eval()
    graph_configs = [
        ("Random p=0.15", 0.15, (1.0, 0.0, 0.0)),
        ("Preferential", 0.10, (0.0, 1.0, 0.0)),
        ("Hub-spoke", 0.10, (0.0, 0.0, 1.0)),
        ("Mixed (hard)", 0.08, (0.2, 0.5, 0.3)),
    ]
    iter_counts = [config.train_iters, 32, 64, 128, 256]
    max_iters = max(iter_counts)

    for name, ep, gw in graph_configs:
        adj, targets, meta = generate_batch(eval_batch, config, ep, gw, device)
        print(f"\n  {name} (gini={meta['gini']:.2f}, max_pr={meta['max_pr']:.4f})")
        print(f"  {'Iters':>6s} | {'KL':>8s} | {'MSE':>10s} | {'Rank':>6s} | {'Top5':>6s}")
        print(f"  {'-'*6} | {'-'*8} | {'-'*10} | {'-'*6} | {'-'*6}")
        with torch.no_grad():
            # In eval mode the k-th prediction does not depend on how many
            # further iterations are run, so one pass serves every count.
            all_prs = model(adj, n_iters=max_iters)
            for n_iters in iter_counts:
                final_pr = all_prs[n_iters - 1]
                kl = F.kl_div((final_pr + 1e-10).log(), targets, reduction='batchmean').item()
                mse = ((final_pr - targets) ** 2).mean().item()
                rank_acc = _ranking_accuracy(final_pr, targets)
                top5_acc = _topk_accuracy(final_pr, targets, k=5)
                print(f"  {n_iters:6d} | {kl:8.4f} | {mse:10.6f} | {rank_acc:5.1%} | {top5_acc:5.1%}")

    if verbose:
        # Show examples from hub-spoke graphs (most non-uniform PageRank).
        adj, targets, _ = generate_batch(4, config, 0.10, (0.0, 0.0, 1.0), device)
        with torch.no_grad():
            all_prs = model(adj, n_iters=256)
            final_pr = all_prs[-1]
        # Graphs with fewer than 5 nodes cannot provide a top-5.
        top_n = min(5, targets.shape[-1])
        print(f"\n  Sample predictions (hub-spoke, 256 iters):")
        for i in range(min(4, len(adj))):
            true_topk = targets[i].topk(top_n)
            pred_topk = final_pr[i].topk(top_n)
            true_str = ", ".join(f"n{idx}={val:.3f}" for val, idx in zip(true_topk.values, true_topk.indices))
            pred_str = ", ".join(f"n{idx}={val:.3f}" for val, idx in zip(pred_topk.values, pred_topk.indices))
            mse_i = ((final_pr[i] - targets[i]) ** 2).mean().item()
            print(f"    MSE={mse_i:.6f}")
            print(f"      True top5: {true_str}")
            print(f"      Pred top5: {pred_str}")
def save_checkpoint(model, config, args):
    """Save the model weights and config, optionally uploading them to the Hub.

    Writes ``{"model_state_dict", "config"}`` to ``args.save_path``. When
    ``args.upload_hf`` is set, the checkpoint, this source file, a
    ``config.json`` and a generated ``README.md`` are uploaded to that repo.
    The JSON and README are uploaded from memory, so no temporary files are
    left behind if an upload fails.

    Args:
        model: trained model (a ``torch.compile`` wrapper is unwrapped first).
        config: PageRankConfig whose fields are stored next to the weights.
        args: namespace providing ``save_path`` and ``upload_hf``.
    """
    import json
    import os

    raw_model = model._orig_mod if hasattr(model, '_orig_mod') else model
    checkpoint = {
        "model_state_dict": raw_model.state_dict(),
        "config": {
            "n_nodes": config.n_nodes,
            "d_model": config.d_model,
            "n_heads": config.n_heads,
            "n_layers": config.n_layers,
            "d_ff": config.d_ff,
            "dropout": config.dropout,
            "train_iters": config.train_iters,
            "rope_base": config.rope_base,
            "damping": config.damping,
        },
    }
    torch.save(checkpoint, args.save_path)
    print(f"\nCheckpoint saved to {args.save_path}")

    if not args.upload_hf:
        return

    from huggingface_hub import HfApi
    api = HfApi()
    try:
        api.create_repo(args.upload_hf, exist_ok=True)
    except Exception as e:
        # Best effort: the repo may already exist or creation may be restricted;
        # the uploads below will surface any real problem.
        print(f"Warning: {e}")
    api.upload_file(path_or_fileobj=args.save_path, path_in_repo="model.pt", repo_id=args.upload_hf)
    api.upload_file(path_or_fileobj=os.path.abspath(__file__), path_in_repo="iterative_pagerank.py", repo_id=args.upload_hf)

    config_json = json.dumps(checkpoint["config"], indent=2)
    api.upload_file(path_or_fileobj=config_json.encode("utf-8"), path_in_repo="config.json", repo_id=args.upload_hf)

    n_params = sum(p.numel() for p in raw_model.parameters())
    card = f"""# Iterative PageRank Model
Non-autoregressive iterative transformer that learns PageRank via shared-weight refinement.
## Architecture
- **Params:** {n_params:,}
- **Layers:** {config.n_layers} (shared across {config.train_iters} iterations)
- **Width:** {config.d_model}, **Heads:** {config.n_heads}
- **Graph size:** {config.n_nodes} nodes (directed)
- **Damping:** {config.damping}
## Task
Given a directed graph's adjacency matrix, predict each node's PageRank value.
The model learns power iteration implicitly through iterative refinement.
## Metrics
- **KL divergence:** between predicted and true PR distribution
- **Ranking accuracy:** pairwise ordering correctness
- **Top-k accuracy:** overlap of predicted vs true top-k important nodes
## Usage
```python
import torch
from iterative_pagerank import IterativePageRankModel, PageRankConfig, generate_batch
ckpt = torch.load("model.pt", weights_only=True)
config = PageRankConfig(**ckpt["config"])
model = IterativePageRankModel(config)
model.load_state_dict(ckpt["model_state_dict"])
model.eval()
adj, targets, meta = generate_batch(1, config, edge_prob=0.1,
    graph_weights=(0.0, 0.0, 1.0), device="cpu")
with torch.no_grad():
    all_prs = model(adj, n_iters=64)
    print(f"Predicted: {{all_prs[-1][0].topk(5)}}")
    print(f"True:      {{targets[0].topk(5)}}")
```
"""
    api.upload_file(path_or_fileobj=card.encode("utf-8"), path_in_repo="README.md", repo_id=args.upload_hf)
    print(f"Uploaded to https://huggingface.co/{args.upload_hf}")
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
def main():
    """Parse command-line options, build the model config and start training."""
    default_device = "cuda" if torch.cuda.is_available() else "cpu"
    parser = argparse.ArgumentParser(description="Iterative PageRank model")

    # (flag, add_argument keyword arguments), registered in this order.
    options = [
        ("--device", {"default": default_device}),
        ("--steps", {"type": int, "default": 50000}),
        ("--batch", {"type": int, "default": 2048}),
        ("--eval-batch", {"type": int, "default": 1024}),
        ("--lr", {"type": float, "default": 2e-3}),
        ("--warmup", {"type": int, "default": 1400}),
        ("--log-interval", {"type": int, "default": 100}),
        ("--eval-interval", {"type": int, "default": 5000}),
        ("--compile", {"action": "store_true"}),
        ("--save-path", {"type": str, "default": None}),
        ("--upload-hf", {"type": str, "default": None}),
        ("--d-model", {"type": int, "default": 128}),
        ("--n-layers", {"type": int, "default": 4}),
        ("--n-heads", {"type": int, "default": 4}),
        ("--d-ff", {"type": int, "default": 512}),
        ("--train-iters", {"type": int, "default": 16}),
        ("--n-nodes", {"type": int, "default": 64}),
        ("--damping", {"type": float, "default": 0.85}),
        ("--dropout", {"type": float, "default": 0.1}),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    args = parser.parse_args()

    # Model/data hyperparameters that are copied from the CLI into the config.
    config_fields = (
        "n_nodes", "d_model", "n_heads", "n_layers",
        "d_ff", "dropout", "train_iters", "damping",
    )
    config = PageRankConfig(**{field: getattr(args, field) for field in config_fields})
    train(config, args)


if __name__ == "__main__":
    main()