Spaces:
Running on Zero
Running on Zero
Upload 21 files
Browse files- pr_iqa/__init__.py +3 -0
- pr_iqa/__pycache__/__init__.cpython-310.pyc +0 -0
- pr_iqa/__pycache__/__init__.cpython-38.pyc +0 -0
- pr_iqa/dataset.py +173 -0
- pr_iqa/loss.py +108 -0
- pr_iqa/model/__init__.py +43 -0
- pr_iqa/model/__pycache__/__init__.cpython-310.pyc +0 -0
- pr_iqa/model/__pycache__/__init__.cpython-38.pyc +0 -0
- pr_iqa/model/__pycache__/layers.cpython-310.pyc +0 -0
- pr_iqa/model/__pycache__/layers.cpython-38.pyc +0 -0
- pr_iqa/model/__pycache__/priqa.cpython-310.pyc +0 -0
- pr_iqa/model/__pycache__/priqa.cpython-38.pyc +0 -0
- pr_iqa/model/layers.py +413 -0
- pr_iqa/model/priqa.py +264 -0
- pr_iqa/partial_map/__init__.py +3 -0
- pr_iqa/partial_map/__pycache__/__init__.cpython-310.pyc +0 -0
- pr_iqa/partial_map/__pycache__/__init__.cpython-38.pyc +0 -0
- pr_iqa/partial_map/__pycache__/feature_metric.cpython-310.pyc +0 -0
- pr_iqa/partial_map/__pycache__/feature_metric.cpython-38.pyc +0 -0
- pr_iqa/partial_map/feature_metric.py +285 -0
- pr_iqa/transforms.py +86 -0
pr_iqa/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import PRIQA, build_priqa
|
| 2 |
+
|
| 3 |
+
__all__ = ["PRIQA", "build_priqa"]
|
pr_iqa/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (214 Bytes). View file
|
|
|
pr_iqa/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (212 Bytes). View file
|
|
|
pr_iqa/dataset.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dataset for PR-IQA training.
|
| 3 |
+
|
| 4 |
+
Expected directory structure per scene::
|
| 5 |
+
|
| 6 |
+
s000/
|
| 7 |
+
├── total/ # Original keyframe images (RGB)
|
| 8 |
+
│ ├── 0000.jpg
|
| 9 |
+
│ ├── 0001.jpg
|
| 10 |
+
│ └── ...
|
| 11 |
+
├── tgt_diffusion/ # Generated images per target
|
| 12 |
+
│ └── 0005/
|
| 13 |
+
│ ├── 0005_diff_0.jpg
|
| 14 |
+
│ └── ...
|
| 15 |
+
├── total_map/ # Full quality maps (GT, grayscale)
|
| 16 |
+
│ └── 0005/
|
| 17 |
+
│ ├── 0005_diff_0.png
|
| 18 |
+
│ └── ...
|
| 19 |
+
├── partial_map/ # Partial quality maps (from FeatureMetric)
|
| 20 |
+
│ └── 0005/
|
| 21 |
+
│ ├── 0005_diff_0_ref+10_0015.png
|
| 22 |
+
│ └── ...
|
| 23 |
+
└── partial_mask/ # Overlap masks
|
| 24 |
+
└── 0005/
|
| 25 |
+
├── 0005_diff_0_ref+10_0015.png
|
| 26 |
+
└── ...
|
| 27 |
+
|
| 28 |
+
Each sample is a tuple: (tgt, tgt_diff, full_map, partial_map, partial_mask, current_ref).
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import random
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
|
| 34 |
+
import torch
|
| 35 |
+
from PIL import Image
|
| 36 |
+
from torch.utils.data import Dataset
|
| 37 |
+
import torchvision.transforms.functional as TF
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class SceneDataset(Dataset):
    """Dataset enumerating all valid (tgt, diff, ref, partial_map, mask) combinations.

    Scans each ``s*`` scene directory under ``root_dir`` and builds one sample
    per (target keyframe, diffusion variant, reference offset) triple for which
    both the partial quality map and the overlap mask exist on disk.

    Args:
        root_dir: Directory containing scene folders named ``s*``.
        rgb_transform: Optional transform applied to the three RGB images.
        grayscale_transform: Optional transform applied to the three "L" maps.
        training: When True, apply flip/color-jitter augmentation in __getitem__.
    """

    def __init__(self, root_dir, rgb_transform=None, grayscale_transform=None, training=True):
        self.root_dir = Path(root_dir)
        self.rgb_transform = rgb_transform
        self.grayscale_transform = grayscale_transform
        self.samples = []
        # Fixed frame offsets used to pick reference keyframes around a target.
        self.ref_deltas = [-20, -10, 10, 20]
        self.training = training

        for scene_path in sorted(self.root_dir.glob("s*")):
            if scene_path.is_dir():
                self._index_scene(scene_path)

    def _index_scene(self, scene_path):
        """Append every valid sample combination found under one scene directory."""
        total_dir = scene_path / "total"
        if not total_dir.is_dir():
            return

        total_images = sorted(total_dir.glob("*.jpg"), key=lambda p: int(p.stem))
        num_total = len(total_images)
        if num_total == 0:
            return

        for i, tgt_path in enumerate(total_images):
            tgt_stem = tgt_path.stem
            # References at fixed offsets, wrapping around the sequence.
            # (Paths come straight from glob, so no extra existence check needed.)
            refs = [(total_images[(i + d) % num_total], d) for d in self.ref_deltas]

            tgt_diff_dir = scene_path / "tgt_diffusion" / tgt_stem
            total_map_dir = scene_path / "total_map" / tgt_stem

            for tgt_diff_path in sorted(tgt_diff_dir.glob("*_diff_*.jpg")):
                full_map_path = total_map_dir / f"{tgt_diff_path.stem}.png"
                if not full_map_path.exists():
                    continue

                tgt_diff_stem = tgt_diff_path.stem
                for ref_path, d in refs:
                    # Shared filename pattern for partial map and overlap mask,
                    # e.g. "0005_diff_0_ref+10_0015.png".
                    ref_stem = ref_path.stem
                    name = f"{tgt_diff_stem}_ref{d:+d}_{ref_stem}.png"
                    mask_path = scene_path / "partial_mask" / tgt_stem / name
                    map_path = scene_path / "partial_map" / tgt_stem / name

                    if mask_path.exists() and map_path.exists():
                        self.samples.append({
                            "tgt": tgt_path,
                            "tgt_diff": tgt_diff_path,
                            "full_map": full_map_path,
                            "partial_mask": mask_path,
                            "partial_map": map_path,
                            "current_ref": ref_path,
                        })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        paths = self.samples[idx]

        tgt_img = Image.open(paths["tgt"]).convert("RGB")
        tgt_diff_img = Image.open(paths["tgt_diff"]).convert("RGB")
        full_map_img = Image.open(paths["full_map"]).convert("L")
        partial_mask_img = Image.open(paths["partial_mask"]).convert("L")
        partial_map_img = Image.open(paths["partial_map"]).convert("L")
        cur_ref_img = Image.open(paths["current_ref"]).convert("RGB")

        # -- Augmentation (training only) --
        if self.training:
            # Horizontal flip (p=0.5) applied jointly to all six images so
            # images, maps and masks stay spatially aligned.
            if random.random() > 0.5:
                (tgt_img, tgt_diff_img, cur_ref_img,
                 full_map_img, partial_mask_img, partial_map_img) = [
                    TF.hflip(im) for im in (
                        tgt_img, tgt_diff_img, cur_ref_img,
                        full_map_img, partial_mask_img, partial_map_img,
                    )
                ]

            # Vertical flip (p=0.3), also applied jointly.
            if random.random() > 0.7:
                (tgt_img, tgt_diff_img, cur_ref_img,
                 full_map_img, partial_mask_img, partial_map_img) = [
                    TF.vflip(im) for im in (
                        tgt_img, tgt_diff_img, cur_ref_img,
                        full_map_img, partial_mask_img, partial_map_img,
                    )
                ]

            # Mild color jitter (p=0.5) on the RGB images only; the same factor
            # is applied to tgt/diff/ref so their photometric relation is kept.
            # Factors are drawn in brightness, contrast, saturation order (this
            # preserves the RNG draw sequence of the original implementation).
            if random.random() > 0.5:
                jitter_ops = [
                    (TF.adjust_brightness, random.uniform(0.9, 1.1)),
                    (TF.adjust_contrast, random.uniform(0.9, 1.1)),
                    (TF.adjust_saturation, random.uniform(0.9, 1.1)),
                ]
                for fn, val in jitter_ops:
                    tgt_img = fn(tgt_img, val)
                    tgt_diff_img = fn(tgt_diff_img, val)
                    cur_ref_img = fn(cur_ref_img, val)

        if self.rgb_transform:
            tgt_img, tgt_diff_img, cur_ref_img = map(
                self.rgb_transform, [tgt_img, tgt_diff_img, cur_ref_img]
            )
        if self.grayscale_transform:
            full_map_img, partial_mask_img, partial_map_img = map(
                self.grayscale_transform, [full_map_img, partial_mask_img, partial_map_img]
            )

        return {
            "tgt": tgt_img,
            "tgt_diff": tgt_diff_img,
            "partial_mask": partial_mask_img,
            "partial_map": partial_map_img,
            "full_map": full_map_img,
            "current_ref": cur_ref_img,
        }
|
pr_iqa/loss.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Loss functions for PR-IQA training.
|
| 3 |
+
|
| 4 |
+
Core losses:
|
| 5 |
+
- JSD (Jensen-Shannon Divergence): Distribution matching
|
| 6 |
+
- Masked L1: Pixel-wise L1 on partial map regions
|
| 7 |
+
- Pearson: Correlation-based structural loss
|
| 8 |
+
|
| 9 |
+
Additional losses (optional):
|
| 10 |
+
- Ranking: Pairwise ranking consistency
|
| 11 |
+
- Global mean/std: Statistics matching
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def loss_jsd(pred, target, tau=0.2, reduction="mean", eps=1e-6):
    """Jensen-Shannon Divergence loss.

    Both maps are turned into per-sample probability distributions by taking a
    temperature-scaled logit followed by a softmax over all pixels, then the
    symmetric KL divergence against the midpoint distribution is computed.

    Args:
        pred, target: Maps with values in (0, 1), shape (B, ...).
        tau: Softmax temperature applied to the logits.
        reduction: "mean", "sum", or anything else for per-sample values.
        eps: Numerical-stability constant for clamping/logs.
    """
    # Force fp32 math regardless of any ambient autocast context.
    with torch.autocast(device_type="cuda", enabled=False):
        pred_f = pred.float().clamp(min=eps, max=1 - eps)
        tgt_f = target.float().clamp(min=eps, max=1 - eps)

        pred_logits = torch.logit(pred_f, eps=eps) / tau
        tgt_logits = torch.logit(tgt_f, eps=eps) / tau

        dist_pred = torch.softmax(pred_logits.flatten(start_dim=1), dim=1)
        dist_tgt = torch.softmax(tgt_logits.flatten(start_dim=1), dim=1)

        midpoint = 0.5 * (dist_tgt + dist_pred)

        def _kl(a, b):
            # KL(a || b) per sample, stabilized with eps inside the logs.
            return torch.sum(a * (torch.log(a + eps) - torch.log(b + eps)), dim=1)

        per_sample = 0.5 * (_kl(dist_tgt, midpoint) + _kl(dist_pred, midpoint))

        if reduction == "mean":
            return per_sample.mean().to(pred.dtype)
        if reduction == "sum":
            return per_sample.sum().to(pred.dtype)
        return per_sample.to(pred.dtype)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def loss_masked_l1(pred, target, mask, reduction="mean"):
    """L1 loss restricted to the regions selected by ``mask``.

    With ``reduction="mean"`` the sum of masked absolute errors is divided by
    the number of active mask elements (plus a small epsilon to avoid division
    by zero on empty masks).
    """
    err = torch.abs(pred - target) * mask
    if reduction == "sum":
        return err.sum()
    if reduction == "mean":
        return err.sum() / (mask.sum() + 1e-8)
    return err
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def loss_l1(pred, target, reduction="mean"):
    """Standard (unmasked) L1 loss, cast back to ``pred``'s dtype."""
    diff = torch.abs(pred - target)
    if reduction == "sum":
        return diff.sum().to(pred.dtype)
    if reduction == "mean":
        return diff.mean().to(pred.dtype)
    return diff.to(pred.dtype)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def loss_pearson(pred, target, reduction="mean", eps=1e-6):
    """1 - Pearson correlation coefficient, computed per sample.

    Flattens each sample, centers both signals, and correlates them; a
    perfectly correlated pair gives a loss near 0, anti-correlated near 2.
    """
    a = pred.float().reshape(pred.shape[0], -1).contiguous()
    b = target.float().reshape(target.shape[0], -1).contiguous()

    a = a - a.mean(dim=1, keepdim=True)
    b = b - b.mean(dim=1, keepdim=True)

    cov = (a * b).sum(dim=1)
    # eps inside the sqrt guards against zero-variance inputs.
    norm = torch.sqrt((a * a).sum(dim=1) * (b * b).sum(dim=1) + eps)
    corr = (cov / norm).clamp(-1.0, 1.0)

    out = 1.0 - corr
    if reduction == "mean":
        return out.mean().to(pred.dtype)
    if reduction == "sum":
        return out.sum().to(pred.dtype)
    return out.to(pred.dtype)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def loss_ranking(pred, gt, margin=0.1):
    """Pairwise ranking loss enforcing relative quality ordering.

    Samples H*W/2 random pixel pairs per image and penalizes predictions whose
    ordering disagrees (by more than ``margin``) with the ground-truth ordering.
    """
    batch, _, height, width = pred.shape
    num_pixels = height * width
    num_pairs = int(num_pixels * 0.5)

    pred_vec = pred.view(batch, -1)
    gt_vec = gt.view(batch, -1)

    first = torch.randint(0, num_pixels, (batch, num_pairs), device=pred.device)
    second = torch.randint(0, num_pixels, (batch, num_pairs), device=pred.device)

    # Desired ordering sign per sampled pair (+1, 0, or -1).
    order = torch.sign(gt_vec.gather(1, first) - gt_vec.gather(1, second))
    return F.margin_ranking_loss(
        pred_vec.gather(1, first), pred_vec.gather(1, second), order, margin=margin
    )
|
pr_iqa/model/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .priqa import PRIQA
|
| 2 |
+
from .layers import (
|
| 3 |
+
PartialConv2d,
|
| 4 |
+
GatedPartialEmb,
|
| 5 |
+
GatedEmb,
|
| 6 |
+
FeedForward,
|
| 7 |
+
ChannelGate,
|
| 8 |
+
Attention,
|
| 9 |
+
TransformerLikeBlock,
|
| 10 |
+
SandwichBlock,
|
| 11 |
+
Downsample,
|
| 12 |
+
Upsample,
|
| 13 |
+
Pos2d,
|
| 14 |
+
DropPath,
|
| 15 |
+
LayerNorm,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def build_priqa(
    out_channels: int = 1,
    dim: int = 48,
    num_blocks: tuple = (2, 3, 3, 4),
    heads: tuple = (1, 2, 4, 8),
    ffn_expansion_factor: float = 2.66,
    bias: bool = False,
    layernorm_type: str = "WithBias",
    use_partial_conv: bool = True,
) -> PRIQA:
    """Build a PR-IQA model with default or custom hyperparameters.

    The input channel count is fixed at 4 (e.g. RGB + mask); tuples are
    converted to lists before being handed to the model constructor.
    """
    config = {
        "inp_channels": 4,
        "out_channels": out_channels,
        "dim": dim,
        "num_blocks": list(num_blocks),
        "heads": list(heads),
        "ffn_expansion_factor": ffn_expansion_factor,
        "bias": bias,
        "LayerNorm_type": layernorm_type,
        "use_partial_conv": use_partial_conv,
    }
    return PRIQA(**config)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
__all__ = ["PRIQA", "build_priqa"]
|
pr_iqa/model/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (1.18 kB). View file
|
|
|
pr_iqa/model/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (1.13 kB). View file
|
|
|
pr_iqa/model/__pycache__/layers.cpython-310.pyc
ADDED
|
Binary file (13.9 kB). View file
|
|
|
pr_iqa/model/__pycache__/layers.cpython-38.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
pr_iqa/model/__pycache__/priqa.cpython-310.pyc
ADDED
|
Binary file (6.95 kB). View file
|
|
|
pr_iqa/model/__pycache__/priqa.cpython-38.pyc
ADDED
|
Binary file (6.94 kB). View file
|
|
|
pr_iqa/model/layers.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Building blocks for the PR-IQA architecture.
|
| 3 |
+
|
| 4 |
+
Includes:
|
| 5 |
+
- PartialConv2d: Mask-aware convolution for inpainting
|
| 6 |
+
- GatedPartialEmb / GatedEmb: Gated patch embeddings
|
| 7 |
+
- FeedForward (FFN): Gated depth-wise separable FFN
|
| 8 |
+
- ChannelGate: SE/CBAM-style channel attention
|
| 9 |
+
- Attention: Spatial attention with xformers memory-efficient attention
|
| 10 |
+
- TransformerLikeBlock: Channel gate → Spatial attn → FFN with residuals
|
| 11 |
+
- SandwichBlock: FFN → Channel gate → Spatial attn → FFN
|
| 12 |
+
- Downsample / Upsample: Strided conv / PixelShuffle
|
| 13 |
+
- Pos2d: 2D sinusoidal positional encoding
|
| 14 |
+
- DropPath: Stochastic depth
|
| 15 |
+
- LayerNorm: Bias-free or with-bias layer normalization
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import numbers
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn as nn
|
| 22 |
+
import torch.nn.functional as F
|
| 23 |
+
from einops import rearrange
|
| 24 |
+
from xformers import ops
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
# Layer Normalization
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
|
| 31 |
+
def to_3d(x):
    """Flatten a (b, c, h, w) tensor into token form (b, h*w, c)."""
    batch, channels = x.shape[0], x.shape[1]
    return x.permute(0, 2, 3, 1).reshape(batch, -1, channels)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def to_4d(x, h, w):
    """Restore token form (b, h*w, c) back to a (b, c, h, w) feature map."""
    batch, channels = x.shape[0], x.shape[2]
    return x.reshape(batch, h, w, channels).permute(0, 3, 1, 2)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class BiasFree_LayerNorm(nn.Module):
    """Bias-free layer norm: rescales by the (uncentered-output) std only.

    Note this variant does not subtract the mean before normalizing; it
    divides by sqrt(var + 1e-5) and applies a learnable per-channel weight.
    """

    def __init__(self, normalized_shape):
        super().__init__()
        shape = (
            (normalized_shape,)
            if isinstance(normalized_shape, numbers.Integral)
            else normalized_shape
        )
        shape = torch.Size(shape)
        assert len(shape) == 1
        self.weight = nn.Parameter(torch.ones(shape))
        self.normalized_shape = shape

    def forward(self, x):
        variance = x.var(-1, keepdim=True, unbiased=False)
        return x / torch.sqrt(variance + 1e-5) * self.weight
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class WithBias_LayerNorm(nn.Module):
    """Standard layer norm over the last dimension with learnable affine."""

    def __init__(self, normalized_shape):
        super().__init__()
        shape = (
            (normalized_shape,)
            if isinstance(normalized_shape, numbers.Integral)
            else normalized_shape
        )
        shape = torch.Size(shape)
        assert len(shape) == 1
        self.weight = nn.Parameter(torch.ones(shape))
        self.bias = nn.Parameter(torch.zeros(shape))
        self.normalized_shape = shape

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        variance = x.var(-1, keepdim=True, unbiased=False)
        normed = (x - mean) / torch.sqrt(variance + 1e-5)
        return normed * self.weight + self.bias
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class LayerNorm(nn.Module):
    """Channel layer norm for (b, c, h, w) maps.

    Tokens are flattened to (b, h*w, c), normalized over the channel
    dimension by the selected variant, then reshaped back.
    """

    def __init__(self, dim, LayerNorm_type="WithBias"):
        super().__init__()
        variant = BiasFree_LayerNorm if LayerNorm_type == "BiasFree" else WithBias_LayerNorm
        self.body = variant(dim)

    def forward(self, x):
        height, width = x.shape[-2:]
        return to_4d(self.body(to_3d(x)), height, width)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ---------------------------------------------------------------------------
|
| 85 |
+
# Partial Convolution
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
|
| 88 |
+
class PartialConv2d(nn.Module):
    """Mask-aware convolution for inpainting.

    Given input ``x`` and binary mask ``mask`` (1 = valid), the convolution is
    computed on ``x * mask`` and renormalized by the per-location count of
    valid pixels in the receptive field. Locations whose receptive field
    contains no valid pixel are zeroed, and the returned mask marks them
    invalid so holes propagate through stacked layers.
    """

    def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, stride, padding, bias=False)
        # Frozen all-ones conv: counts valid mask pixels per receptive field.
        self.mask_conv = nn.Conv2d(1, out_ch, kernel_size, stride, padding, bias=False)
        nn.init.constant_(self.mask_conv.weight, 1.0)
        self.mask_conv.weight.requires_grad = False
        self.bias = nn.Parameter(torch.zeros(out_ch)) if bias else None

    def forward(self, x, mask):
        """
        Args:
            x: (B, in_ch, H, W) input features.
            mask: (B, 1, H, W) binary validity mask.

        Returns:
            (output, new_mask) where new_mask is (B, 1, H', W').
        """
        with torch.no_grad():
            valid_count = self.mask_conv(mask)
            # BUG FIX: derive the updated mask from the *raw* count. The
            # previous code clamped to 1e-8 first, which made every entry
            # strictly positive and thus new_mask identically 1 — holes could
            # never propagate.
            new_mask = (valid_count > 0).float()
            mask_sum = valid_count.clamp(min=1e-8)

        output = self.conv(x * mask) / mask_sum
        if self.bias is not None:
            output = output + self.bias.view(1, -1, 1, 1)
        # Zero out positions with no valid support.
        output = output * new_mask
        return output, new_mask[:, 0:1]
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# ---------------------------------------------------------------------------
|
| 116 |
+
# Gated Embeddings
|
| 117 |
+
# ---------------------------------------------------------------------------
|
| 118 |
+
|
| 119 |
+
class GatedPartialEmb(nn.Module):
    """Gated patch embedding built on PartialConv2d (for masked inputs).

    The partial convolution produces 2*embed_dim channels which are split
    into a gate half and a value half: out = gelu(gate) * value.
    """

    def __init__(self, in_c=4, embed_dim=48, bias=False):
        super().__init__()
        self.pconv = PartialConv2d(
            in_c, embed_dim * 2, kernel_size=3, stride=1, padding=1, bias=bias
        )

    def forward(self, x_with_mask, mask):
        """
        Args:
            x_with_mask: (B, in_c, H, W) — e.g. RGB(3) + mask(1) concatenated.
            mask: (B, 1, H, W) — binary mask for the partial conv.

        Returns:
            (embedded features, updated mask).
        """
        feats, mask_out = self.pconv(x_with_mask, mask)
        gate, value = feats.chunk(2, dim=1)
        return F.gelu(gate) * value, mask_out
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class GatedEmb(nn.Module):
    """Gated patch embedding (plain convolution, no mask handling).

    Projects to 2*embed_dim channels, then gates one half with the other:
    out = gelu(gate) * value.
    """

    def __init__(self, in_c=3, embed_dim=48, bias=False):
        super().__init__()
        self.gproj1 = nn.Conv2d(
            in_c, embed_dim * 2, kernel_size=3, stride=1, padding=1, bias=bias
        )

    def forward(self, x):
        gate, value = self.gproj1(x).chunk(2, dim=1)
        return F.gelu(gate) * value
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# ---------------------------------------------------------------------------
|
| 152 |
+
# Feed-Forward Network
|
| 153 |
+
# ---------------------------------------------------------------------------
|
| 154 |
+
|
| 155 |
+
class FeedForward(nn.Module):
    """Gated depth-wise separable feed-forward network.

    1x1 expansion to 2*hidden channels, depth-wise 3x3 conv, gating
    (gelu(half) * half), then 1x1 projection back to ``dim`` channels.
    """

    def __init__(self, dim, ffn_expansion_factor, bias):
        super().__init__()
        hidden_features = int(dim * ffn_expansion_factor)
        self.project_in = nn.Conv2d(dim, hidden_features * 2, kernel_size=1, bias=bias)
        self.dwconv = nn.Conv2d(
            hidden_features * 2,
            hidden_features * 2,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=hidden_features * 2,
            bias=bias,
        )
        self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)

    def forward(self, x):
        gate, value = self.dwconv(self.project_in(x)).chunk(2, dim=1)
        return self.project_out(F.gelu(gate) * value)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ---------------------------------------------------------------------------
|
| 177 |
+
# Channel Attention
|
| 178 |
+
# ---------------------------------------------------------------------------
|
| 179 |
+
|
| 180 |
+
class ChannelGate(nn.Module):
    """SE/CBAM-style channel gate.

    Pools each input globally (avg, optionally + max), maps the pooled vector
    through a bottleneck MLP, and applies the resulting sigmoid gate to the
    input (and to ``kv`` when present — both share the same gate).
    """

    def __init__(self, dim, reduction=16, use_max=True, bias=True):
        super().__init__()
        hidden = max(1, dim // reduction)
        self.mlp = nn.Sequential(
            nn.Conv2d(dim, hidden, 1, bias=bias),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden, dim, 1, bias=bias),
        )
        self.use_max = use_max

    def _pooled(self, t):
        """Global descriptor for one tensor, passed through the shared MLP."""
        pooled = F.adaptive_avg_pool2d(t, 1)
        if self.use_max:
            pooled = pooled + F.adaptive_max_pool2d(t, 1)
        return self.mlp(pooled)

    def forward(self, x, kv=None):
        logits = self._pooled(x)
        if kv is not None:
            logits = logits + self._pooled(kv)
        gate = torch.sigmoid(logits)
        if kv is None:
            return x * gate, None
        return x * gate, kv * gate
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
# ---------------------------------------------------------------------------
|
| 211 |
+
# Spatial Attention (xformers)
|
| 212 |
+
# ---------------------------------------------------------------------------
|
| 213 |
+
|
| 214 |
+
class Attention(nn.Module):
    """Spatial attention with xformers memory-efficient attention.

    Supports both self-attention (kv=None) and cross-attention (kv provided).
    Includes a spatial gating branch: the attention output is modulated by a
    map computed from the (downsampled) query input before the final 1x1
    projection.
    """

    def __init__(self, dim, num_heads, bias):
        super().__init__()
        # dim must be divisible by num_heads (head_dim = dim // num_heads).
        self.num_heads = num_heads

        # Self-attention projections: one fused 1x1 conv producing q,k,v,
        # followed by a depth-wise 3x3 for local mixing.
        self.qkv = nn.Conv2d(dim, dim * 3, kernel_size=1, bias=bias)
        self.qkv_dwconv = nn.Conv2d(
            dim * 3, dim * 3, kernel_size=3, stride=1, padding=1,
            groups=dim * 3, bias=bias,
        )

        # Cross-attention projections: q from x, k/v from the external kv map.
        self.q_proj_qonly = nn.Conv2d(dim, dim, kernel_size=1, bias=bias)
        self.q_dw_qonly = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, groups=dim, bias=bias)
        self.kv_proj_cross = nn.Conv2d(dim, dim * 2, kernel_size=1, bias=bias)
        self.kv_dwconv_cross = nn.Conv2d(
            dim * 2, dim * 2, kernel_size=3, stride=1, padding=1,
            groups=dim * 2, bias=bias,
        )

        self.project_out = nn.Conv2d(dim, dim, kernel_size=1, bias=bias)

        # Spatial gating: work at half resolution, then bilinearly upsample
        # the gate back to the attention output's size.
        self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
        self.upsample_to = lambda t, size: F.interpolate(t, size=size, mode="bilinear", align_corners=False)
        self.conv = nn.Sequential(
            nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, bias=True),
            LayerNorm(dim, "WithBias"),
            nn.ReLU(inplace=True),
            nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, bias=True),
            LayerNorm(dim, "WithBias"),
            nn.ReLU(inplace=True),
        )

    def forward(self, x, kv=None):
        """
        Args:
            x: (B, dim, H, W) query features.
            kv: Optional (B, dim, H', W') key/value features for cross-attention;
                when None, self-attention over ``x`` is used.

        Returns:
            (B, dim, H, W) attended features.
        """
        b, c, h, w = x.shape
        head_dim = c // self.num_heads

        if kv is None:
            qkv = self.qkv_dwconv(self.qkv(x))
            q, k, v = qkv.chunk(3, dim=1)
        else:
            q = self.q_dw_qonly(self.q_proj_qonly(x))
            kv_feat = self.kv_dwconv_cross(self.kv_proj_cross(kv))
            k, v = kv_feat.chunk(2, dim=1)

        # Reshape to (B, seq_len, num_heads, head_dim) — the layout consumed
        # by xformers' memory_efficient_attention. k/v use -1 for seq_len so
        # cross-attention sources of a different spatial size are accepted.
        q = q.view(b, self.num_heads, head_dim, h * w).permute(0, 3, 1, 2).contiguous()
        k = k.view(b, self.num_heads, head_dim, -1).permute(0, 3, 1, 2).contiguous()
        v = v.view(b, self.num_heads, head_dim, -1).permute(0, 3, 1, 2).contiguous()

        out = ops.memory_efficient_attention(q, k, v)
        # Back to (B, C, H, W) at the query's spatial resolution.
        out = out.permute(0, 2, 3, 1).reshape(b, c, h, w)

        # Spatial gating: multiplicative, non-negative (ReLU-terminated branch).
        spatial_weight = self.avg_pool(x)
        spatial_weight = self.conv(spatial_weight)
        spatial_weight = self.upsample_to(spatial_weight, (h, w))
        out = out * spatial_weight

        return self.project_out(out)
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# ---------------------------------------------------------------------------
|
| 284 |
+
# Drop Path (Stochastic Depth)
|
| 285 |
+
# ---------------------------------------------------------------------------
|
| 286 |
+
|
| 287 |
+
class DropPath(nn.Module):
    """Stochastic depth: drops entire samples of a residual branch.

    During training, each sample in the batch is zeroed with probability
    ``p`` and survivors are rescaled by 1/(1-p) to keep the expectation
    unchanged. At eval time (or when p == 0) this is the identity.
    """

    def __init__(self, p: float = 0.0):
        super().__init__()
        self.p = float(p)

    def forward(self, x):
        if not self.training or self.p == 0.0:
            return x
        keep_prob = 1.0 - self.p
        # Per-sample binary mask, broadcast over (C, H, W).
        survivors = torch.rand(x.shape[0], 1, 1, 1, device=x.device, dtype=x.dtype) < keep_prob
        return x * survivors / keep_prob
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
# ---------------------------------------------------------------------------
|
| 301 |
+
# Transformer-like Block
|
| 302 |
+
# ---------------------------------------------------------------------------
|
| 303 |
+
|
| 304 |
+
class TransformerLikeBlock(nn.Module):
    """Channel gate → spatial attention → FFN, each with pre-norm, layer
    scale, stochastic depth, and a residual connection.

    When ``kv`` is given, the channel gate also gates ``kv`` and the spatial
    attention runs in cross-attention mode against the gated ``kv``.
    """

    def __init__(self, dim, num_heads, ffn_expansion_factor, bias, LayerNorm_type,
                 drop_path=0.0, layerscale_init=1e-2):
        super().__init__()
        self.norm_c = LayerNorm(dim, LayerNorm_type)
        self.chan = ChannelGate(dim, reduction=16, use_max=True, bias=bias)
        self.norm_s = LayerNorm(dim, LayerNorm_type)
        self.sattn = Attention(dim, num_heads, bias)
        self.norm_f = LayerNorm(dim, LayerNorm_type)
        self.ffn = FeedForward(dim, ffn_expansion_factor, bias)

        # Per-branch layer-scale parameters (small init stabilizes training).
        self.gamma_c = nn.Parameter(torch.ones(1, dim, 1, 1) * layerscale_init)
        self.gamma_s = nn.Parameter(torch.ones(1, dim, 1, 1) * layerscale_init)
        self.gamma_f = nn.Parameter(torch.ones(1, dim, 1, 1) * layerscale_init)
        self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity()

    def forward(self, x, kv=None):
        # 1) Channel-gate branch (also produces the gated kv for attention).
        gated, kv_gated = self.chan(self.norm_c(x), kv)
        x = x + self.drop_path(self.gamma_c * gated)

        # 2) Spatial attention branch (cross-attention when kv was provided).
        attn_kv = kv_gated if kv is not None else None
        x = x + self.drop_path(self.gamma_s * self.sattn(self.norm_s(x), attn_kv))

        # 3) Feed-forward branch.
        x = x + self.drop_path(self.gamma_f * self.ffn(self.norm_f(x)))
        return x
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
# ---------------------------------------------------------------------------
|
| 338 |
+
# Sandwich Block
|
| 339 |
+
# ---------------------------------------------------------------------------
|
| 340 |
+
|
| 341 |
+
class SandwichBlock(nn.Module):
    """FFN → channel gate → spatial attention → FFN, all as plain residuals.

    Same branch structure as ``TransformerLikeBlock`` but with a leading FFN
    and without layer scales or drop path.
    """

    def __init__(self, dim, num_heads, ffn_expansion_factor, bias, LayerNorm_type):
        super().__init__()
        self.norm1_1 = LayerNorm(dim, LayerNorm_type)
        self.ffn1 = FeedForward(dim, ffn_expansion_factor, bias)
        self.norm_c = LayerNorm(dim, LayerNorm_type)
        self.chan = ChannelGate(dim, reduction=16, use_max=True, bias=bias)
        self.norm1 = LayerNorm(dim, LayerNorm_type)
        self.attn = Attention(dim, num_heads, bias)
        self.norm2 = LayerNorm(dim, LayerNorm_type)
        self.ffn = FeedForward(dim, ffn_expansion_factor, bias)

    def forward(self, x, kv=None):
        # Leading FFN residual.
        x = x + self.ffn1(self.norm1_1(x))

        # Channel gate; also produces the gated cross-attention source.
        gated, gated_kv = self.chan(self.norm_c(x), kv)
        x = x + gated

        # Spatial attention, cross-attending when reference features exist.
        x = x + self.attn(self.norm1(x), gated_kv if kv is not None else None)

        # Trailing FFN residual.
        return x + self.ffn(self.norm2(x))
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
# ---------------------------------------------------------------------------
|
| 366 |
+
# Downsample / Upsample
|
| 367 |
+
# ---------------------------------------------------------------------------
|
| 368 |
+
|
| 369 |
+
class Downsample(nn.Module):
    """Halve the spatial resolution while doubling channels (strided conv)."""

    def __init__(self, n_feat):
        super().__init__()
        halve = nn.Conv2d(n_feat, n_feat * 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.body = nn.Sequential(halve)

    def forward(self, x, mask=None):
        # `mask` is accepted for interface symmetry with other blocks
        # but is not used here.
        return self.body(x)
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
class Upsample(nn.Module):
    """Double the spatial resolution and halve channels (conv + pixel shuffle)."""

    def __init__(self, n_feat):
        super().__init__()
        # Conv expands to 2*n_feat; PixelShuffle(2) then trades 4x channels
        # for 2x spatial size, leaving n_feat // 2 channels.
        expand = nn.Conv2d(n_feat, n_feat * 2, kernel_size=3, stride=1, padding=1, bias=False)
        self.body = nn.Sequential(expand, nn.PixelShuffle(2))

    def forward(self, x, mask=None):
        # `mask` is accepted for interface symmetry with other blocks
        # but is not used here.
        return self.body(x)
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
# ---------------------------------------------------------------------------
|
| 393 |
+
# Positional Encoding
|
| 394 |
+
# ---------------------------------------------------------------------------
|
| 395 |
+
|
| 396 |
+
class Pos2d(nn.Module):
    """2D sinusoidal positional encoding.

    Builds a 4-channel coordinate grid ``(x, y, sin(pi*x), cos(pi*y))`` over
    the input's spatial extent, projects it to ``dim`` channels with a 1x1
    conv, and adds it to the input.

    Fixes over the original:
      - the coordinate grid is cast to the projection's weight dtype, so the
        module works when converted to half precision (the grid was always
        float32 before, which made the 1x1 conv fail on a ``.half()`` model);
      - the batch dimension is broadcast with ``expand`` instead of
        materializing ``B`` copies via ``repeat``.
    """

    def __init__(self, dim):
        super().__init__()
        # 4 coordinate channels -> feature dimension.
        self.proj = nn.Conv2d(4, dim, kernel_size=1)

    def forward(self, x):
        B, C, H, W = x.shape
        device = x.device
        yy, xx = torch.meshgrid(
            torch.linspace(-1, 1, H, device=device),
            torch.linspace(-1, 1, W, device=device),
            indexing="ij",
        )
        pe4 = torch.stack([xx, yy, torch.sin(xx * 3.14159), torch.cos(yy * 3.14159)], dim=0)
        # Match the conv's dtype so half-precision modules work.
        pe4 = pe4.to(self.proj.weight.dtype)
        # Broadcast over the batch instead of copying B times.
        pe = self.proj(pe4.unsqueeze(0)).expand(B, -1, -1, -1)
        return x + pe
|
pr_iqa/model/priqa.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PR-IQA: Partial-Reference Image Quality Assessment model.
|
| 3 |
+
|
| 4 |
+
3-input U-Net encoder-decoder with cross-attention:
|
| 5 |
+
- tgt_img: partial quality map (from FeatureMetric) replicated to 3ch
|
| 6 |
+
- dif_img: generated / distorted image
|
| 7 |
+
- ref_img: reference image
|
| 8 |
+
|
| 9 |
+
Each input comes with a 4-scale mask pyramid (whole, half, quarter, tiny).
|
| 10 |
+
|
| 11 |
+
Architecture:
|
| 12 |
+
Encoder: 4 levels (dim → 2*dim → 4*dim → 8*dim)
|
| 13 |
+
- img_encoder: shared for ref_img and dif_img (self-attention)
|
| 14 |
+
- map_encoder: for tgt_img (cross-attention with ref features)
|
| 15 |
+
- qfuse: fuses dif and tgt encoder outputs at each level
|
| 16 |
+
Decoder: 3 levels with skip connections from the dif encoder
|
| 17 |
+
Output: tanh-activated quality map
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn as nn
|
| 22 |
+
import torch.nn.functional as F
|
| 23 |
+
|
| 24 |
+
from .layers import (
|
| 25 |
+
GatedPartialEmb,
|
| 26 |
+
GatedEmb,
|
| 27 |
+
TransformerLikeBlock,
|
| 28 |
+
Downsample,
|
| 29 |
+
Upsample,
|
| 30 |
+
Pos2d,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class PRIQA(nn.Module):
    """Partial-Reference Image Quality Assessment model.

    3-input U-Net encoder-decoder with cross-attention. The shared
    ``img_encoder_*`` stacks process both the reference and the distorted
    image (same weights, different inputs); the ``map_encoder_*`` stacks
    process the partial quality map. Both the dif and tgt branches
    cross-attend to the reference features of the same level.

    Args:
        inp_channels: Input channels per image (typically 4 = RGB + mask).
        out_channels: Output channels (1 for quality map, 3 for RGB).
        dim: Base feature dimension (doubles at each encoder level).
        num_blocks: Number of TransformerLikeBlocks at each level.
        heads: Number of attention heads at each level.
        ffn_expansion_factor: FFN hidden dim multiplier.
        bias: Use bias in convolutions.
        LayerNorm_type: ``"WithBias"`` or ``"BiasFree"``.
        use_partial_conv: Use PartialConv2d in patch embedding.
    """

    def __init__(
        self,
        inp_channels=4,
        out_channels=3,
        dim=48,
        num_blocks=[4, 6, 6, 8],  # NOTE(review): mutable default; read-only here, but tuples would be safer
        heads=[1, 2, 4, 8],
        ffn_expansion_factor=2.66,
        bias=False,
        LayerNorm_type="WithBias",
        use_partial_conv=True,
    ):
        super().__init__()
        self.use_partial_conv = use_partial_conv

        # -- Patch embedding (one module shared by all three inputs) --
        if use_partial_conv:
            self.patch_embed = GatedPartialEmb(inp_channels, dim, bias)
        else:
            self.patch_embed = GatedEmb(inp_channels, dim, bias)

        # -- Quality fusion (dif + tgt) at each level: 1x1 conv halves channels --
        self.qfuse_l1 = nn.Conv2d(dim * 2, dim, kernel_size=1, bias=bias)
        self.qfuse_l2 = nn.Conv2d(int(dim * 2 ** 1) * 2, int(dim * 2 ** 1), kernel_size=1, bias=bias)
        self.qfuse_l3 = nn.Conv2d(int(dim * 2 ** 2) * 2, int(dim * 2 ** 2), kernel_size=1, bias=bias)
        self.qfuse_l4 = nn.Conv2d(int(dim * 2 ** 3) * 2, int(dim * 2 ** 3), kernel_size=1, bias=bias)

        # -- Downsampler (Downsample accepts a mask argument but ignores it) --
        self.down1_2 = Downsample(dim)
        self.down2_3 = Downsample(int(dim * 2 ** 1))
        self.down3_4 = Downsample(int(dim * 2 ** 2))

        # -- Positional Encoding: one per resolution/width, encoder and decoder --
        self.pos_l1 = Pos2d(dim)
        self.pos_l2 = Pos2d(int(dim * 2 ** 1))
        self.pos_l3 = Pos2d(int(dim * 2 ** 2))
        self.pos_l4 = Pos2d(int(dim * 2 ** 3))
        self.pos_d3 = Pos2d(int(dim * 2 ** 2))
        self.pos_d2 = Pos2d(int(dim * 2 ** 1))
        self.pos_d1 = Pos2d(int(dim * 2 ** 1))  # level-1 decoder runs at 2*dim (no channel reduce)

        # -- Encoder (img: shared for ref & dif) --
        def _make_encoder(level_dim, n_blocks, n_heads):
            # Stack of TransformerLikeBlocks at a fixed resolution/width.
            return nn.ModuleList([
                TransformerLikeBlock(
                    dim=level_dim, num_heads=n_heads,
                    ffn_expansion_factor=ffn_expansion_factor,
                    bias=bias, LayerNorm_type=LayerNorm_type,
                )
                for _ in range(n_blocks)
            ])

        self.img_encoder_level1 = _make_encoder(dim, num_blocks[0], heads[0])
        self.img_encoder_level2 = _make_encoder(int(dim * 2 ** 1), num_blocks[1], heads[1])
        self.img_encoder_level3 = _make_encoder(int(dim * 2 ** 2), num_blocks[2], heads[2])
        self.img_latent = _make_encoder(int(dim * 2 ** 3), num_blocks[3], heads[3])

        # -- Encoder (map: for tgt, cross-attention with ref) --
        self.map_encoder_level1 = _make_encoder(dim, num_blocks[0], heads[0])
        self.map_encoder_level2 = _make_encoder(int(dim * 2 ** 1), num_blocks[1], heads[1])
        self.map_encoder_level3 = _make_encoder(int(dim * 2 ** 2), num_blocks[2], heads[2])
        self.map_latent = _make_encoder(int(dim * 2 ** 3), num_blocks[3], heads[3])

        # -- Decoder (skip connections come from the fused dif branch) --
        self.up4_3 = Upsample(int(dim * 2 ** 3))
        self.reduce_chan_level3 = nn.Conv2d(int(dim * 2 ** 3), int(dim * 2 ** 2), kernel_size=1, bias=bias)
        self.decoder_level3 = _make_encoder(int(dim * 2 ** 2), num_blocks[2], heads[2])

        self.up3_2 = Upsample(int(dim * 2 ** 2))
        self.reduce_chan_level2 = nn.Conv2d(int(dim * 2 ** 2), int(dim * 2 ** 1), kernel_size=1, bias=bias)
        self.decoder_level2 = _make_encoder(int(dim * 2 ** 1), num_blocks[1], heads[1])

        self.up2_1 = Upsample(int(dim * 2 ** 1))
        self.decoder_level1 = _make_encoder(int(dim * 2 ** 1), num_blocks[0], heads[0])

        # -- Output --
        self.output = nn.Sequential(
            nn.Conv2d(int(dim * 2 ** 1), out_channels, kernel_size=3, stride=1, padding=1, bias=bias),
        )

    def forward(
        self,
        tgt_img, dif_img, ref_img,
        tgt_mask_whole, tgt_mask_half, tgt_mask_quarter, tgt_mask_tiny,
        dif_mask_whole, dif_mask_half, dif_mask_quarter, dif_mask_tiny,
        ref_mask_whole, ref_mask_half, ref_mask_quarter, ref_mask_tiny,
    ):
        """
        Args:
            tgt_img: (B, 3, H, W) — partial quality map replicated to 3ch.
            dif_img: (B, 3, H, W) — generated / distorted image.
            ref_img: (B, 3, H, W) — reference image.
            *_mask_*: (B, 1, H/s, W/s) — mask pyramids at 4 scales.

        Returns:
            (B, out_channels, H, W) quality map (tanh activated, so in [-1, 1]).
        """
        # -- Patch embedding: each input is concatenated with its full-res mask --
        if self.use_partial_conv:
            # GatedPartialEmb also returns an updated mask, which is discarded.
            tgt_enc_level1, _ = self.patch_embed(
                torch.cat((tgt_img, tgt_mask_whole), dim=1), tgt_mask_whole,
            )
            dif_enc_level1, _ = self.patch_embed(
                torch.cat((dif_img, dif_mask_whole), dim=1), dif_mask_whole,
            )
            ref_enc_level1, _ = self.patch_embed(
                torch.cat((ref_img, ref_mask_whole), dim=1), ref_mask_whole,
            )
        else:
            tgt_enc_level1 = self.patch_embed(torch.cat((tgt_img, tgt_mask_whole), dim=1))
            dif_enc_level1 = self.patch_embed(torch.cat((dif_img, dif_mask_whole), dim=1))
            ref_enc_level1 = self.patch_embed(torch.cat((ref_img, ref_mask_whole), dim=1))

        tgt_enc_level1 = self.pos_l1(tgt_enc_level1)
        dif_enc_level1 = self.pos_l1(dif_enc_level1)
        ref_enc_level1 = self.pos_l1(ref_enc_level1)

        # ── ENCODER Level 1 ──
        # Reference is encoded first (self-attention only) and serves as kv
        # for both the tgt and dif branches at this level.
        out_ref_enc_level1 = ref_enc_level1
        for block in self.img_encoder_level1:
            out_ref_enc_level1 = block(out_ref_enc_level1)
        kv_level1 = out_ref_enc_level1

        out_tgt_enc_level1 = tgt_enc_level1
        for block in self.map_encoder_level1:
            out_tgt_enc_level1 = block(out_tgt_enc_level1, kv_level1)

        # Same weights as the ref pass above, but cross-attending to ref.
        out_dif_enc_level1 = dif_enc_level1
        for block in self.img_encoder_level1:
            out_dif_enc_level1 = block(out_dif_enc_level1, kv_level1)

        # Fuse dif + tgt features back to `dim` channels.
        out_dif_enc_level1 = self.qfuse_l1(torch.cat([out_dif_enc_level1, out_tgt_enc_level1], dim=1))

        # ── ENCODER Level 2 ──
        inp_tgt_enc_level2 = self.pos_l2(self.down1_2(out_tgt_enc_level1, tgt_mask_whole))
        inp_dif_enc_level2 = self.pos_l2(self.down1_2(out_dif_enc_level1, dif_mask_whole))
        inp_ref_enc_level2 = self.pos_l2(self.down1_2(out_ref_enc_level1, ref_mask_whole))

        out_ref_enc_level2 = inp_ref_enc_level2
        for block in self.img_encoder_level2:
            out_ref_enc_level2 = block(out_ref_enc_level2)
        kv_level2 = out_ref_enc_level2

        out_tgt_enc_level2 = inp_tgt_enc_level2
        for block in self.map_encoder_level2:
            out_tgt_enc_level2 = block(out_tgt_enc_level2, kv_level2)

        out_dif_enc_level2 = inp_dif_enc_level2
        for block in self.img_encoder_level2:
            out_dif_enc_level2 = block(out_dif_enc_level2, kv_level2)

        out_dif_enc_level2 = self.qfuse_l2(torch.cat([out_dif_enc_level2, out_tgt_enc_level2], dim=1))

        # ── ENCODER Level 3 ──
        inp_tgt_enc_level3 = self.pos_l3(self.down2_3(out_tgt_enc_level2, tgt_mask_half))
        inp_dif_enc_level3 = self.pos_l3(self.down2_3(out_dif_enc_level2, dif_mask_half))
        inp_ref_enc_level3 = self.pos_l3(self.down2_3(out_ref_enc_level2, ref_mask_half))

        out_ref_enc_level3 = inp_ref_enc_level3
        for block in self.img_encoder_level3:
            out_ref_enc_level3 = block(out_ref_enc_level3)
        kv_level3 = out_ref_enc_level3

        out_tgt_enc_level3 = inp_tgt_enc_level3
        for block in self.map_encoder_level3:
            out_tgt_enc_level3 = block(out_tgt_enc_level3, kv_level3)

        out_dif_enc_level3 = inp_dif_enc_level3
        for block in self.img_encoder_level3:
            out_dif_enc_level3 = block(out_dif_enc_level3, kv_level3)

        out_dif_enc_level3 = self.qfuse_l3(torch.cat([out_dif_enc_level3, out_tgt_enc_level3], dim=1))

        # ── ENCODER Level 4 (Latent) ──
        inp_tgt_enc_level4 = self.pos_l4(self.down3_4(out_tgt_enc_level3, tgt_mask_quarter))
        inp_dif_enc_level4 = self.pos_l4(self.down3_4(out_dif_enc_level3, dif_mask_quarter))
        inp_ref_enc_level4 = self.pos_l4(self.down3_4(out_ref_enc_level3, ref_mask_quarter))

        ref_latent_out = inp_ref_enc_level4
        for block in self.img_latent:
            ref_latent_out = block(ref_latent_out)
        kv_level4 = ref_latent_out

        tgt_latent_out = inp_tgt_enc_level4
        for block in self.map_latent:
            tgt_latent_out = block(tgt_latent_out, kv_level4)

        dif_latent_out = inp_dif_enc_level4
        for block in self.img_latent:
            dif_latent_out = block(dif_latent_out, kv_level4)

        latent_out = self.qfuse_l4(torch.cat([dif_latent_out, tgt_latent_out], dim=1))

        # ── DECODER ──
        # Skip connections come from the fused dif-branch encoder outputs.
        # Upsample also takes a mask argument that is currently ignored.
        inp_dec_level3 = self.up4_3(latent_out, dif_mask_tiny)
        inp_dec_level3 = torch.cat([inp_dec_level3, out_dif_enc_level3], 1)
        inp_dec_level3 = self.pos_d3(self.reduce_chan_level3(inp_dec_level3))
        out_dec_level3 = inp_dec_level3
        for block in self.decoder_level3:
            out_dec_level3 = block(out_dec_level3)

        inp_dec_level2 = self.up3_2(out_dec_level3, dif_mask_quarter)
        inp_dec_level2 = torch.cat([inp_dec_level2, out_dif_enc_level2], 1)
        inp_dec_level2 = self.pos_d2(self.reduce_chan_level2(inp_dec_level2))
        out_dec_level2 = inp_dec_level2
        for block in self.decoder_level2:
            out_dec_level2 = block(out_dec_level2)

        # Level 1 keeps 2*dim channels (no reduce conv before the blocks).
        inp_dec_level1 = self.up2_1(out_dec_level2, dif_mask_half)
        inp_dec_level1 = torch.cat([inp_dec_level1, out_dif_enc_level1], 1)
        inp_dec_level1 = self.pos_d1(inp_dec_level1)
        out_dec_level1 = inp_dec_level1
        for block in self.decoder_level1:
            out_dec_level1 = block(out_dec_level1)

        return torch.tanh(self.output(out_dec_level1))
|
pr_iqa/partial_map/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .feature_metric import FeatureMetric
|
| 2 |
+
|
| 3 |
+
__all__ = ["FeatureMetric"]
|
pr_iqa/partial_map/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (214 Bytes). View file
|
|
|
pr_iqa/partial_map/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (212 Bytes). View file
|
|
|
pr_iqa/partial_map/__pycache__/feature_metric.cpython-310.pyc
ADDED
|
Binary file (8.03 kB). View file
|
|
|
pr_iqa/partial_map/__pycache__/feature_metric.cpython-38.pyc
ADDED
|
Binary file (7.91 kB). View file
|
|
|
pr_iqa/partial_map/feature_metric.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FeatureMetric: DINOv2 + LoftUp feature-based quality metric.
|
| 3 |
+
|
| 4 |
+
Generates partial quality maps by:
|
| 5 |
+
1. Extracting DINOv2 features (upsampled via LoftUp) from input images
|
| 6 |
+
2. Using VGGT for monocular depth and pose estimation
|
| 7 |
+
3. Constructing a colored 3D point cloud with features
|
| 8 |
+
4. Rendering the point cloud from the target viewpoint via PyTorch3D
|
| 9 |
+
5. Computing cosine similarity between rendered features and target features
|
| 10 |
+
|
| 11 |
+
Two modes:
|
| 12 |
+
- partial_generation=True: Full 3D pipeline → partial map + overlap mask
|
| 13 |
+
- partial_generation=False: Direct cosine similarity → total quality map
|
| 14 |
+
|
| 15 |
+
Dependencies (Level 1):
|
| 16 |
+
- VGGT (facebook/VGGT-1B)
|
| 17 |
+
- LoftUp (andrehuang/loftup)
|
| 18 |
+
- PyTorch3D
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import sys
|
| 22 |
+
import torch
|
| 23 |
+
from torch import Tensor
|
| 24 |
+
from torch.nn import Module
|
| 25 |
+
import numpy as np
|
| 26 |
+
from typing import Optional, Tuple, Union
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
from einops import rearrange
|
| 29 |
+
|
| 30 |
+
# Auto-detect submodule paths
|
| 31 |
+
_THIS_DIR = Path(__file__).resolve().parent
|
| 32 |
+
_REPO_ROOT = _THIS_DIR.parent.parent
|
| 33 |
+
_SUBMODULES = _REPO_ROOT / "submodules"
|
| 34 |
+
|
| 35 |
+
if (_SUBMODULES / "vggt").exists():
|
| 36 |
+
sys.path.insert(0, str(_SUBMODULES / "vggt"))
|
| 37 |
+
if (_SUBMODULES / "loftup").exists():
|
| 38 |
+
sys.path.insert(0, str(_SUBMODULES / "loftup"))
|
| 39 |
+
|
| 40 |
+
# Lazy imports for heavy dependencies — loaded on first use
|
| 41 |
+
_VGGT = None
|
| 42 |
+
_LOFTUP_FEATURIZERS = None
|
| 43 |
+
_LOFTUP_UPSAMPLERS = None
|
| 44 |
+
_PYTORCH3D = None
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _import_vggt():
    """Import VGGT lazily on first use and memoise its symbols.

    Returns a dict mapping symbol names to the imported VGGT callables.
    """
    global _VGGT
    if _VGGT is not None:
        return _VGGT
    from vggt.models.vggt import VGGT
    from vggt.utils.pose_enc import pose_encoding_to_extri_intri
    from vggt.utils.geometry import unproject_depth_map_to_point_map
    from vggt.utils.load_fn import load_and_preprocess_images
    _VGGT = {
        "VGGT": VGGT,
        "pose_encoding_to_extri_intri": pose_encoding_to_extri_intri,
        "unproject_depth_map_to_point_map": unproject_depth_map_to_point_map,
        "load_and_preprocess_images": load_and_preprocess_images,
    }
    return _VGGT
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _import_loftup():
    """Import the LoftUp featurizer factory and norm helper lazily.

    Returns the ``(get_featurizer, norm)`` pair, memoised in module globals.
    """
    global _LOFTUP_FEATURIZERS, _LOFTUP_UPSAMPLERS
    if _LOFTUP_FEATURIZERS is not None:
        return _LOFTUP_FEATURIZERS, _LOFTUP_UPSAMPLERS
    from featurizers import get_featurizer
    from upsamplers import norm
    _LOFTUP_FEATURIZERS = get_featurizer
    _LOFTUP_UPSAMPLERS = norm
    return _LOFTUP_FEATURIZERS, _LOFTUP_UPSAMPLERS
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _import_pytorch3d():
    """Import the PyTorch3D point-rendering pieces lazily and memoise them.

    Returns a dict of the renderer classes plus the private OpenCV-camera
    conversion helper.
    """
    global _PYTORCH3D
    if _PYTORCH3D is not None:
        return _PYTORCH3D
    from pytorch3d.structures import Pointclouds
    from pytorch3d.renderer import (
        PointsRasterizationSettings,
        PointsRasterizer,
        AlphaCompositor,
    )
    # Private helper: converts OpenCV-style (R, T, K) to PyTorch3D cameras.
    from pytorch3d.renderer.camera_conversions import _cameras_from_opencv_projection
    _PYTORCH3D = {
        "Pointclouds": Pointclouds,
        "PointsRasterizationSettings": PointsRasterizationSettings,
        "PointsRasterizer": PointsRasterizer,
        "AlphaCompositor": AlphaCompositor,
        "_cameras_from_opencv_projection": _cameras_from_opencv_projection,
    }
    return _PYTORCH3D
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class FeatureMetric(Module):
    """DINOv2 + LoftUp + VGGT → partial / total quality map.

    Args:
        img_size: Inference image size (controls rasterizer resolution).
        feature_backbone: Name of the feature backbone (default: ``"dinov2"``).
        loftup_torch_hub: Torch Hub repository for LoftUp.
        loftup_model_name: LoftUp model name.
        vggt_weights: HuggingFace model ID for VGGT.
        use_vggt: Load VGGT for depth/pose estimation.
        use_loftup: Load LoftUp for feature upsampling.
    """

    def __init__(
        self,
        img_size: int = 256,
        feature_backbone: str = "dinov2",
        loftup_torch_hub: Union[str, Path] = "andrehuang/loftup",
        loftup_model_name: Union[str, Path] = "loftup_dinov2s",
        vggt_weights: Union[str, Path] = "facebook/VGGT-1B",
        use_vggt: bool = True,
        use_loftup: bool = False,
        **kwargs,
    ) -> None:
        super().__init__()
        self.img_size = img_size

        # Backbone factory returns (model, patch_size, feature_dim).
        get_featurizer, _ = _import_loftup()
        self.feature_backbone, self.patch_size, self.dim = get_featurizer(feature_backbone)

        # Optional LoftUp upsampler fetched from Torch Hub (network access).
        self.upsampler = (
            torch.hub.load(loftup_torch_hub, loftup_model_name, pretrained=True)
            if use_loftup else None
        )
        self.use_loftup = use_loftup

        # VGGT supplies depth + camera poses for the 3D partial-map mode.
        # NOTE(review): when use_vggt=False, `self.vggt` is never set, so
        # forward(partial_generation=True) would raise AttributeError.
        if use_vggt:
            vggt_mod = _import_vggt()
            self.vggt = vggt_mod["VGGT"].from_pretrained(vggt_weights)

        p3d = _import_pytorch3d()
        self.compositor = p3d["AlphaCompositor"]()

    def _render(self, point_clouds, **kwargs):
        """Render point cloud features to images.

        Requires ``self.rasterizer`` to have been set (done at the start of
        ``forward``). Returns ``(images, zbuf)`` where images is (K, H, W, C).
        """
        # Rasterization is numerically fragile in half precision; force fp32.
        with torch.autocast("cuda", enabled=False):
            fragments = self.rasterizer(point_clouds, **kwargs)

        # Distance-based splat weights: 1 at the point center, 0 at radius r.
        r = self.rasterizer.raster_settings.radius
        dists2 = fragments.dists.permute(0, 3, 1, 2)
        weights = 1 - dists2 / (r * r)

        images = self.compositor(
            fragments.idx.long().permute(0, 3, 1, 2),
            weights,
            point_clouds.features_packed().permute(1, 0),
            **kwargs,
        )
        images = images.permute(0, 2, 3, 1)
        return images, fragments.zbuf

    @torch.no_grad()
    def forward(
        self,
        device: str,
        images: Tensor,  # (K, 3, H, W)
        return_overlap_mask: bool = False,
        return_score_map: bool = False,
        return_projections: bool = False,
        partial_generation: bool = False,
        use_filtering: bool = False,
    ) -> Tuple[float, Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
        """Compute quality score map.

        Args:
            device: Torch device string.
            images: (K, 3, H, W) input images. First image is the target.
            partial_generation: If True, use full 3D pipeline for partial map.

        Returns:
            (score_scalar, overlap_mask, score_map, projections)
        """
        k, c, h, w = images.shape
        p3d = _import_pytorch3d()
        _, norm_fn = _import_loftup()

        # Setup rasterizer — rebuilt per call because image size can vary.
        raster_settings = p3d["PointsRasterizationSettings"](
            image_size=(h, w), radius=0.01, points_per_pixel=10, bin_size=0,
        )
        self.rasterizer = p3d["PointsRasterizer"](cameras=None, raster_settings=raster_settings)

        # Extract per-image features (one forward per image to bound memory).
        images_norm = norm_fn(images)
        hr_feats = []
        for i in range(k):
            img = images_norm[i:i + 1]
            lr_feat = self.feature_backbone(img)
            if self.use_loftup and self.upsampler is not None:
                hr_feat = self.upsampler(lr_feat, img)
            else:
                hr_feat = lr_feat
            # Flatten spatial dims: (1, C, h', w') -> (1, h'*w', C).
            hr_feat = rearrange(hr_feat, "b c h w -> b (h w) c")
            hr_feats.append(hr_feat)
        hr_feats = torch.cat(hr_feats, dim=0)

        if not partial_generation:
            # Fast cosine similarity mode: per-pixel cosine between image 0
            # (target) and image 1 (reference); other images are ignored.
            dot = (hr_feats[0] * hr_feats[1]).sum(dim=1)
            tgt_norm = torch.linalg.norm(hr_feats[0], dim=1)
            ref_norm = torch.linalg.norm(hr_feats[1], dim=1)
            cosine_sim = dot / (tgt_norm * ref_norm + 1e-8)
            score_map = torch.clamp(cosine_sim, min=0.0, max=1.0)

            # Output resolution: full-res with LoftUp, else the patch grid.
            if self.use_loftup and self.upsampler is not None:
                H_out, W_out = h, w
            else:
                H_out = h // self.patch_size
                W_out = w // self.patch_size
            score_map = score_map.reshape(H_out, W_out).unsqueeze(0)
            return score_map.mean().item(), None, score_map if return_score_map else None, None

        # Full 3D partial map generation
        vggt_mod = _import_vggt()
        pose_encoding_to_extri_intri = vggt_mod["pose_encoding_to_extri_intri"]
        unproject_depth_map_to_point_map = vggt_mod["unproject_depth_map_to_point_map"]

        preds = self.vggt(images)
        extrinsic, intrinsic = pose_encoding_to_extri_intri(preds["pose_enc"], images.shape[-2:])
        depth, depth_conf = preds["depth"], preds["depth_conf"]

        # Unproject each view's depth into a world-space point map (numpy).
        point_map = unproject_depth_map_to_point_map(
            depth.squeeze(0), extrinsic.squeeze(0), intrinsic.squeeze(0),
        )
        # NOTE(review): `cols` is computed but never used afterwards — dead code?
        cols = images.cpu().numpy().transpose(0, 2, 3, 1)
        cols = cols / cols.max()
        pts_flatten = torch.from_numpy(
            rearrange(point_map, "k h w c -> k (h w) c")
        ).float().to(device)

        if use_filtering:
            # Drop the least-confident 20% of depth samples (global quantile).
            percent = 20
            quantile = torch.quantile(depth_conf, percent / 100.0)
            mask_flat = rearrange((depth_conf > quantile).squeeze(0), "k h w -> k (h w)")
            points_list, features_list = [], []
            for i in range(k):
                valid = mask_flat[i]
                points_list.append(pts_flatten[i][valid])
                features_list.append(hr_feats[i][valid])
            point_clouds = p3d["Pointclouds"](points=points_list, features=features_list)
        else:
            point_clouds = p3d["Pointclouds"](points=pts_flatten, features=hr_feats)

        # Render from target viewpoint (camera 0), replicated for each cloud.
        # NOTE(review): extrinsic/intrinsic are recomputed here — identical to
        # the values already computed above.
        extrinsic, intrinsic = pose_encoding_to_extri_intri(preds["pose_enc"], images.shape[-2:])
        E, K = extrinsic.squeeze(0), intrinsic.squeeze(0)
        R0, T0, K0 = E[0, :3, :3], E[0, :3, 3], K[0]
        B = pts_flatten.shape[0]

        R_repeat = R0.unsqueeze(0).repeat(B, 1, 1)
        T_repeat = T0.unsqueeze(0).repeat(B, 1)
        K_repeat = K0.unsqueeze(0).repeat(B, 1, 1)
        im_size = torch.tensor([[h, w]]).repeat(B, 1).to(device)

        cameras_p3d = p3d["_cameras_from_opencv_projection"](R_repeat, T_repeat, K_repeat, im_size)

        with torch.autocast("cuda", enabled=False):
            # Large negative sentinel so unfilled background pixels are
            # clearly distinguishable from real feature values.
            bg_color = torch.tensor(
                [-10000] * hr_feats[0].shape[-1], dtype=torch.float32, device=device,
            )
            rendering, zbuf = self._render(point_clouds, cameras=cameras_p3d, background_color=bg_color)
            rendering = rearrange(rendering, "k h w c -> k c h w")

        # Cosine similarity score map: target render vs each reference render.
        target = rendering[0:1]
        reference = rendering[1:]
        dot = (reference * target).sum(dim=1)
        tgt_norm = torch.linalg.norm(target, dim=1)
        ref_norm = torch.linalg.norm(reference, dim=1)
        cosine_sim = dot / (tgt_norm * ref_norm + 1e-8)
        score_map = torch.clamp(cosine_sim, min=0.0, max=1.0)

        # Mask true background: pixels covered by neither target nor any
        # reference render (zbuf < 0 means no point landed there).
        target_mask = zbuf[0, ..., 0] >= 0
        reference_mask = zbuf[1:, ..., 0] >= 0
        true_bg = ~target_mask & ~torch.any(reference_mask, dim=0)
        score_map[:, true_bg] = 0.0

        # Per-reference visibility mask (same expression as reference_mask).
        overlap_mask = zbuf[1:, ..., 0] >= 0

        return (
            score_map.mean().item(),
            overlap_mask if return_overlap_mask else None,
            score_map if return_score_map else None,
            rendering if return_projections else None,
        )
|
pr_iqa/transforms.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data transforms and batch preparation utilities for PR-IQA training.
|
| 3 |
+
|
| 4 |
+
ImageNet normalization is applied to RGB inputs.
|
| 5 |
+
Grayscale inputs (partial maps, masks) are kept in [0, 1].
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
import torchvision.transforms as T
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# ImageNet normalization constants
|
| 14 |
+
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
| 15 |
+
IMAGENET_STD = (0.229, 0.224, 0.225)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def build_rgb_transform(img_size: int = 256) -> T.Compose:
    """Build the preprocessing pipeline for RGB inputs.

    Resizes to a square ``img_size`` x ``img_size``, converts to a float
    tensor in [0, 1], then applies ImageNet mean/std normalization.
    """
    steps = [
        T.Resize((img_size, img_size)),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def build_grey_transform(img_size: int = 256) -> T.Compose:
    """Build the preprocessing pipeline for grayscale maps/masks.

    Resizes to a square ``img_size`` x ``img_size`` and converts to a
    float tensor; values stay in [0, 1] (no normalization applied).
    """
    steps = [
        T.Resize((img_size, img_size)),
        T.ToTensor(),
    ]
    return T.Compose(steps)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def make_pyramid_masks(mask_whole: torch.Tensor):
    """Downsample a (B, 1, H, W) mask to 1/2, 1/4, and 1/8 resolution.

    Nearest-neighbour interpolation is used so mask values are picked,
    never blended, keeping hard edges intact.

    Returns:
        Tuple of three tensors at scale factors 0.5, 0.25, 0.125.
    """
    half, quarter, tiny = (
        F.interpolate(mask_whole, scale_factor=factor, mode="nearest")
        for factor in (0.5, 0.25, 0.125)
    )
    return half, quarter, tiny
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def prepare_batch(batch: dict, device: torch.device):
    """Assemble PRIQA.forward() arguments from a raw dataset batch.

    Moves tensors to ``device`` as bfloat16 in channels-last layout,
    expands the single-channel partial map to 3 channels, and builds the
    four-level mask pyramid (whole/half/quarter/tiny) for the target,
    diff, and reference streams.

    Args:
        batch: Dataset batch with keys "tgt_diff", "partial_mask",
            "partial_map", "current_ref", "full_map".
        device: Destination device for all tensors.

    Returns:
        model_args: Tuple of 15 tensors matching PRIQA.forward()
            signature (tgt_img, dif_img, ref_img + 12 mask tensors).
        gt: (B, 1, H, W) ground-truth quality map.
    """
    def _move(t: torch.Tensor) -> torch.Tensor:
        # Shared transfer policy: bfloat16, async copy, channels-last.
        return t.to(device, dtype=torch.bfloat16, non_blocking=True,
                    memory_format=torch.channels_last)

    dif_img = _move(batch["tgt_diff"])
    tgt_mask_whole = _move(batch["partial_mask"])
    # Partial map is single-channel; tile to 3 channels for the backbone.
    tgt_img = _move(batch["partial_map"]).repeat(1, 3, 1, 1)
    ref_img = _move(batch["current_ref"])
    gt = _move(batch["full_map"])

    tgt_masks = (tgt_mask_whole,) + tuple(make_pyramid_masks(tgt_mask_whole))
    # Diff and reference streams are fully valid, so their masks are
    # all-ones at every pyramid level.
    dif_masks = tuple(torch.ones_like(m) for m in tgt_masks)
    ref_masks = tuple(torch.ones_like(m) for m in tgt_masks)

    model_args = (tgt_img, dif_img, ref_img) + tgt_masks + dif_masks + ref_masks
    return model_args, gt
|