abhivsh commited on
Commit
baf8731
Β·
verified Β·
1 Parent(s): e46db36

Upload app.py

Browse files

Deleted unnecessary steps and reduced the output size.

Files changed (1) hide show
  1. app.py +2051 -0
app.py ADDED
@@ -0,0 +1,2051 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py β€” POWERGRID Document Auditor (single-file HuggingFace Spaces build)
3
+ =============================================================================
4
+ Single-file Gradio app for AI-powered engineering drawing comparison.
5
+ Designed for POWERGRID (765/400/132kV AIS/GIS vendor drawing audits).
6
+
7
+ Pipeline:
8
+ Stage 1 β€” Global Alignment : Phase Correlation + ORB/RANSAC homography
9
+ Stage 2 β€” Region Extraction : Content-aware morphology (no pretrained detector)
10
+ Stage 3 β€” Semantic Matching : ResNet50 embeddings + cosine similarity (position-agnostic)
11
+ Stage 4 β€” Siamese Comparison : ResNet50 patch embeddings + GradCAM heatmaps
12
+
13
+ Run locally:
14
+ python app.py
15
+ """
16
+
17
+ # ══════════════════════════════════════════════════════════════════════
18
+ # IMPORTS
19
+ # ══════════════════════════════════════════════════════════════════════
20
+
21
+ import base64
22
+ import io
23
+ import logging
24
+ import os
25
+ import time
26
+ from dataclasses import dataclass, field
27
+ from typing import Dict, List, Optional, Tuple
28
+
29
+ import cv2
30
+ import fitz # PyMuPDF
31
+ import gradio as gr
32
+ import numpy as np
33
+ import torch
34
+ import torch.nn as nn
35
+ import torch.nn.functional as F
36
+ from PIL import Image
37
+ from scipy.optimize import linear_sum_assignment
38
+ from skimage.metrics import structural_similarity as ssim
39
+ from torchvision import models, transforms
40
+
41
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
42
+ logger = logging.getLogger(__name__)
43
+
44
+ # ── Logo: embed as base64 so it works on HuggingFace Spaces (no static folder) ──
45
+ def _load_logo_b64(filename: str = "logo_0.png") -> str:
46
+ """Return a data-URI string for the logo, or empty string if file not found."""
47
+ logo_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
48
+ if os.path.exists(logo_path):
49
+ with open(logo_path, "rb") as f:
50
+ b64 = base64.b64encode(f.read()).decode("utf-8")
51
+ ext = filename.rsplit(".", 1)[-1].lower()
52
+ mime = "image/png" if ext == "png" else f"image/{ext}"
53
+ return f"data:{mime};base64,{b64}"
54
+ return ""
55
+
56
+ _LOGO_URI = _load_logo_b64("logo_0.png")
57
+
58
+
59
+ # ══════════════════════════════════════════════════════════════════════
60
+ # DATA STRUCTURES
61
+ # ══════════════════════════════════════════════════════════════════════
62
+
63
@dataclass
class Region:
    """A detected layout region (axis-aligned bounding box)."""
    x: int
    y: int
    w: int
    h: int
    label: str = "text_block"  # text_block | figure | table | margin
    confidence: float = 1.0

    @property
    def bbox(self) -> Tuple[int, int, int, int]:
        """Corner representation (x0, y0, x1, y1) of the box."""
        return (self.x, self.y, self.x + self.w, self.y + self.h)

    @property
    def area(self) -> int:
        """Box area in pixels."""
        return self.w * self.h

    @property
    def center(self) -> Tuple[float, float]:
        """Box centre point as a float pair."""
        return (self.x + self.w / 2.0, self.y + self.h / 2.0)

    def iou(self, other: "Region") -> float:
        """Intersection-over-union with another region (0.0 when disjoint)."""
        left = max(self.x, other.x)
        top = max(self.y, other.y)
        right = min(self.x + self.w, other.x + other.w)
        bottom = min(self.y + self.h, other.y + other.h)
        overlap = max(0, right - left) * max(0, bottom - top)
        combined = self.area + other.area - overlap
        if combined <= 0:
            return 0.0
        return overlap / combined
93
+
94
+
95
@dataclass
class MatchedPair:
    """A matched region pair between old and new documents.

    Produced by the Stage-3 matcher with only ``match_score``,
    ``position_cost`` and ``appearance_cost`` filled in; the Stage-4
    comparator then writes the pixel/SSIM/semantic metrics and the
    optional Grad-CAM heatmap back onto the same instance.
    """
    # Geometry: the paired boxes in OLD and NEW page coordinates.
    region_old: Region
    region_new: Region
    # Matching diagnostics (cosine similarity and its decomposition).
    match_score: float
    position_cost: float
    appearance_cost: float
    # Stage-4 comparison metrics; defaults mean "not yet compared".
    pixel_diff: float = 0.0
    ssim_score: float = 1.0
    semantic_diff: float = 0.0
    total_change: float = 0.0
    # Per-pixel change map resized to region_new (float32 in [0, 1]), or None.
    heatmap: Optional[np.ndarray] = None
108
+
109
+
110
@dataclass
class ComparisonResult:
    """Full comparison result for one document page."""
    matched_pairs: List[MatchedPair]
    unmatched_old: List[Region]
    unmatched_new: List[Region]
    global_transform: Optional[np.ndarray]
    total_change_pct: float
    heatmap: np.ndarray
    img_old_aligned: Optional[np.ndarray] = None  # aligned OLD, same coord-space as NEW

    def summary(self) -> str:
        """Human-readable multi-line report of the page comparison."""
        alignment = 'Applied' if self.global_transform is not None else 'Skipped'
        report = [
            f" Global Alignment : {alignment}",
            f" Matched Pairs : {len(self.matched_pairs)}",
            f" Deleted Regions : {len(self.unmatched_old)}",
            f" Added Regions : {len(self.unmatched_new)}",
            f" Total Change : {self.total_change_pct:.1f}%",
        ]
        modified = [p for p in self.matched_pairs if p.total_change > 0.05]
        if modified:
            mean_change = np.mean([p.total_change for p in modified])
            report.append(f" Avg Change (modified regions): {mean_change:.2f}")
        return "\n".join(report)
134
+
135
+
136
+ # ══════════════════════════════════════════════════════════════════════
137
+ # STAGE 1 β€” GLOBAL ALIGNER
138
+ # ══════════════════════════════════════════════════════════════════════
139
+
140
class GlobalAligner:
    """Stage 1 — estimate and apply a global OLD→NEW page transform.

    FFT phase correlation supplies a pure-translation fallback; ORB
    keypoints + RANSAC partial-affine is the primary estimate.
    """

    def __init__(self, orb_features: int = 2000, ransac_threshold: float = 5.0):
        self.orb_features = orb_features
        self.ransac_threshold = ransac_threshold

    def _phase_correlation_shift(self, gray1: np.ndarray, gray2: np.ndarray) -> Tuple[float, float]:
        """Translation (dx, dy) taking gray1 onto gray2 via FFT phase correlation."""
        spec_a = np.fft.fft2(gray1.astype(np.float32))
        spec_b = np.fft.fft2(gray2.astype(np.float32))
        product = spec_a * np.conj(spec_b)
        normalised = product / (np.abs(product) + 1e-10)
        surface = np.fft.ifft2(normalised).real
        peak_y, peak_x = np.unravel_index(np.argmax(surface), surface.shape)
        rows, cols = gray1.shape
        # Unwrap the circular peak coordinates into signed shifts.
        if peak_y > rows // 2:
            peak_y -= rows
        if peak_x > cols // 2:
            peak_x -= cols
        return float(-peak_x), float(-peak_y)

    def _orb_affine(self, gray_old: np.ndarray, gray_new: np.ndarray) -> Optional[np.ndarray]:
        """ORB + RANSAC partial-affine matrix mapping OLD→NEW, or None on failure."""
        detector = cv2.ORB_create(nfeatures=self.orb_features)
        kp_old, desc_old = detector.detectAndCompute(gray_old, None)
        kp_new, desc_new = detector.detectAndCompute(gray_new, None)
        if desc_old is None or desc_new is None or min(len(kp_old), len(kp_new)) < 10:
            return None
        matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        ranked = sorted(matcher.match(desc_old, desc_new), key=lambda m: m.distance)
        if len(ranked) < 10:
            return None
        keep = ranked[:min(200, len(ranked))]
        # src = OLD keypoints, dst = NEW keypoints, so the estimated matrix is
        # the forward OLD→NEW transform that cv2.warpAffine expects: it places
        # OLD pixels at their NEW positions. (A swapped src/dst here would
        # double the displacement instead of correcting it — the red/cyan
        # fringe failure mode in the Alignment Check view.)
        src = np.float32([kp_old[m.queryIdx].pt for m in keep]).reshape(-1, 1, 2)
        dst = np.float32([kp_new[m.trainIdx].pt for m in keep]).reshape(-1, 1, 2)
        matrix, _ = cv2.estimateAffinePartial2D(
            src, dst, method=cv2.RANSAC,
            ransacReprojThreshold=self.ransac_threshold,
        )
        return matrix

    def align(self, img_old: np.ndarray, img_new: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Warp OLD into NEW's coordinate space; returns (aligned_old, matrix)."""
        gray_old = cv2.cvtColor(img_old, cv2.COLOR_RGB2GRAY)
        gray_new = cv2.cvtColor(img_new, cv2.COLOR_RGB2GRAY)
        shift_x, shift_y = self._phase_correlation_shift(gray_old, gray_new)
        matrix = self._orb_affine(gray_old, gray_new)
        if matrix is None:
            # RANSAC failed or too few features: fall back to pure translation.
            matrix = np.array([[1.0, 0.0, shift_x], [0.0, 1.0, shift_y]], dtype=np.float32)
        rows, cols = img_old.shape[:2]
        warped = cv2.warpAffine(
            img_old, matrix, (cols, rows),
            flags=cv2.INTER_LINEAR,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=(255, 255, 255),
        )
        return warped, matrix
200
+
201
+
202
+ # ══════════════════════════════════════════════════════════════════════
203
+ # STAGE 2 β€” LAYOUT REGION EXTRACTOR
204
+ # ══════════════════════════════════════════════════════════════════════
205
+
206
class LayoutRegionExtractor:
    """Stage 2 — content-aware region proposal via binarisation + morphology.

    No pretrained detector: Otsu-binarise the page, dilate ink into blobs,
    take contour bounding boxes, classify each by aspect/density heuristics,
    then merge heavily-overlapping boxes.
    """

    def __init__(
        self,
        min_area_ratio: float = 0.0003,   # discard boxes smaller than this fraction of the page
        max_area_ratio: float = 0.92,     # discard near-page-sized boxes (frames, borders)
        dilation_kernel: Tuple[int, int] = (8, 2),
        dilation_iters: int = 2,
        merge_iou_threshold: float = 0.40,
    ):
        self.min_area_ratio = min_area_ratio
        self.max_area_ratio = max_area_ratio
        self.dilation_kernel = dilation_kernel
        self.dilation_iters = dilation_iters
        self.merge_iou_threshold = merge_iou_threshold

    def _binarise(self, gray: np.ndarray) -> np.ndarray:
        """Otsu-threshold a blurred grayscale page; ink becomes white (255)."""
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        return binary

    def _dilate(self, binary: np.ndarray) -> np.ndarray:
        """Grow ink pixels into connected blobs so nearby marks merge into regions."""
        k = cv2.getStructuringElement(cv2.MORPH_RECT, self.dilation_kernel)
        dilated = cv2.dilate(binary, k, iterations=self.dilation_iters)
        # Wide horizontal kernel: bridge the gaps between marks on the same line.
        k_line = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 1))
        dilated = cv2.dilate(dilated, k_line, iterations=1)
        # Small vertical close: join adjacent lines into one block.
        k_vert = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 8))
        return cv2.morphologyEx(dilated, cv2.MORPH_CLOSE, k_vert)

    def _classify(self, patch_gray: np.ndarray, w: int, h: int) -> str:
        """Heuristic label from aspect ratio and ink density.

        Returns one of "margin" (near-empty), "text_block", "figure", "table".
        """
        aspect = w / max(h, 1)
        _, binary = cv2.threshold(patch_gray, 127, 255, cv2.THRESH_BINARY_INV)
        density = np.sum(binary > 0) / max(w * h, 1)
        if density < 0.02:
            # Almost no ink — but a few isolated strokes still count as content,
            # so only near-empty patches with < 3 contours become "margin".
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            if len(contours) < 3:
                return "margin"
        if aspect > 4.0 and density > 0.06:
            return "text_block"
        if 0.4 < aspect < 2.8 and density < 0.25:
            return "figure"
        if density > 0.18 and aspect > 1.0:
            return "table"
        return "text_block"

    def _merge_overlapping(self, regions: List[Region]) -> List[Region]:
        """Union boxes whose IoU with a growing box exceeds the threshold.

        Fixpoint loop: each pass greedily grows a seed box by absorbing
        overlapping neighbours; repeats until a pass merges nothing.
        NOTE(review): merged boxes are re-created without a label, so the
        composite falls back to the default "text_block" and loses any
        figure/table classification — confirm this is intended.
        """
        changed = True
        while changed:
            changed = False
            used = [False] * len(regions)
            merged: List[Region] = []
            for i, r1 in enumerate(regions):
                if used[i]:
                    continue
                x0, y0 = r1.x, r1.y
                x1, y1 = r1.x + r1.w, r1.y + r1.h
                for j, r2 in enumerate(regions):
                    if i == j or used[j]:
                        continue
                    # Test against the box as grown so far, not the original seed.
                    expanded = Region(x0, y0, x1 - x0, y1 - y0)
                    if expanded.iou(r2) > self.merge_iou_threshold:
                        x0 = min(x0, r2.x)
                        y0 = min(y0, r2.y)
                        x1 = max(x1, r2.x + r2.w)
                        y1 = max(y1, r2.y + r2.h)
                        used[j] = True
                        changed = True
                merged.append(Region(x0, y0, x1 - x0, y1 - y0))
                used[i] = True
            regions = merged
        return regions

    def extract(self, img_rgb: np.ndarray) -> List[Region]:
        """Detect layout regions on an RGB page image.

        Returns regions sorted roughly top-to-bottom (50-px row bands) then
        left-to-right; "margin" candidates are dropped before merging.
        """
        h, w = img_rgb.shape[:2]
        page_area = h * w
        gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
        binary = self._binarise(gray)
        dilated = self._dilate(binary)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        candidates: List[Region] = []
        for cnt in contours:
            rx, ry, rw, rh = cv2.boundingRect(cnt)
            area = rw * rh
            if area < page_area * self.min_area_ratio:
                continue
            if area > page_area * self.max_area_ratio:
                continue
            patch = gray[ry: ry + rh, rx: rx + rw]
            label = self._classify(patch, rw, rh)
            if label == "margin":
                continue
            candidates.append(Region(rx, ry, rw, rh, label=label))
        regions = self._merge_overlapping(candidates)
        regions.sort(key=lambda r: (r.y // 50, r.x))
        logger.info("LayoutExtractor: %d regions detected", len(regions))
        return regions
301
+
302
+
303
+ # ══════════════════════════════════════════════════════════════════════
304
+ # STAGE 3 β€” HUNGARIAN REGION MATCHER
305
+ # ══════════════════════════════════════════════════════════════════════
306
+
307
+ # ══════════════════════════════════════════════════════════════════════
308
+ # STAGE 3 β€” SEMANTIC RETRIEVAL MATCHER (position-agnostic)
309
+ # ══════════════════════════════════════════════════════════════════════
310
+
311
class SemanticRetrievalMatcher:
    """Position-agnostic region matcher (replaces HungarianRegionMatcher).

    Every OLD and NEW region patch is encoded with the shared ResNet50
    backbone into a 128-d L2-normalised vector. An (N_new x N_old) cosine
    similarity matrix is then solved with the Hungarian algorithm
    (maximising similarity), and a pair is accepted only when its
    similarity reaches ``min_similarity``. A region that merely *moved*
    (different x/y, same content) still scores ~1.0 and matches correctly.
    """

    def __init__(
        self,
        encoder: "_SiameseEncoder",
        device: torch.device,
        min_similarity: float = 0.50,
        thumbnail_size: Tuple[int, int] = (224, 224),
    ):
        self.encoder = encoder
        self.device = device
        self.min_similarity = min_similarity
        self._transform = transforms.Compose([
            transforms.Resize(thumbnail_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    # ------------------------------------------------------------------
    def _patch(self, region: Region, img: np.ndarray) -> np.ndarray:
        """Crop a region from the image; returns a white 64x64 patch if empty."""
        crop = img[region.y: region.y + region.h, region.x: region.x + region.w]
        if crop.size == 0:
            return np.full((64, 64, 3), 255, dtype=np.uint8)
        return crop

    def _embed(self, patches: List[np.ndarray]) -> torch.Tensor:
        """Batch-encode patches into (N, 128) L2-normalised embeddings (no grad)."""
        stacked = torch.stack(
            [self._transform(Image.fromarray(p)) for p in patches]
        ).to(self.device)  # (N, 3, 224, 224)
        with torch.no_grad():
            vectors, _ = self.encoder.encode(stacked)  # (N, 128), already normalised
        return vectors

    # ------------------------------------------------------------------
    def match(
        self,
        regions_old: List[Region],
        regions_new: List[Region],
        img_old: np.ndarray,
        img_new: np.ndarray,
    ) -> Tuple[List[MatchedPair], List[Region], List[Region]]:
        """Pair NEW regions with OLD ones by appearance only.

        Returns (matched_pairs, deleted_old_regions, added_new_regions).
        """
        n_old, n_new = len(regions_old), len(regions_new)
        if not regions_old or not regions_new:
            return [], list(regions_old), list(regions_new)

        # 1. Embed every patch from both pages.
        emb_old = self._embed([self._patch(r, img_old) for r in regions_old])
        emb_new = self._embed([self._patch(r, img_new) for r in regions_new])

        # 2. Cosine similarity = dot product of unit vectors; rows=NEW, cols=OLD.
        sim_mat = torch.mm(emb_new, emb_old.T).cpu().numpy()  # (n_new, n_old)

        # 3. Hungarian assignment on the negated matrix maximises similarity.
        new_idx, old_idx = linear_sum_assignment(-sim_mat)

        pairs: List[MatchedPair] = []
        taken_old: set = set()
        taken_new: set = set()
        for ni, oi in zip(new_idx, old_idx):
            score = float(sim_mat[ni, oi])
            if score < self.min_similarity:
                continue  # below threshold -> leave both regions unmatched
            pairs.append(MatchedPair(
                region_old=regions_old[oi],
                region_new=regions_new[ni],
                match_score=score,
                position_cost=0.0,  # appearance-only matching: no position penalty
                appearance_cost=max(0.0, 1.0 - score),
            ))
            taken_old.add(oi)
            taken_new.add(ni)

        deleted = [regions_old[i] for i in range(n_old) if i not in taken_old]
        added = [regions_new[j] for j in range(n_new) if j not in taken_new]

        logger.info(
            "SemanticRetrieval: %d matched | %d deleted | %d added "
            "(min_sim=%.2f)",
            len(pairs), len(deleted), len(added),
            self.min_similarity,
        )
        return pairs, deleted, added
421
+
422
+
423
+ # ══════════════════════════════════════════════════════════════════════
424
+ # STAGE 4 β€” SIAMESE PATCH COMPARATOR
425
+ # ══════════════════════════════════════════════════════════════════════
426
+
427
class _SiameseEncoder(nn.Module):
    """Shared ResNet50 backbone with a 128-d L2-normalised embedding head.

    ``features`` (conv trunk), ``pool`` and ``embed`` stay as public
    attributes because Grad-CAM hooks attach to ``features[-1]`` externally.
    """

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        # Keep everything up to (not including) avgpool + fc: a spatial feature map.
        self.features = nn.Sequential(*list(backbone.children())[:-2])
        self.pool = backbone.avgpool
        self.embed = nn.Sequential(
            nn.Linear(2048, 512), nn.ReLU(),
            nn.Linear(512, 128),
        )

    def encode(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (L2-normalised 128-d embedding, raw conv feature map)."""
        fmap = self.features(x)
        vec = torch.flatten(self.pool(fmap), 1)
        vec = F.normalize(self.embed(vec), p=2, dim=1)
        return vec, fmap

    def forward(self, x1: torch.Tensor, x2: torch.Tensor):
        """Encode both inputs; returns (e1, e2, fmap1, fmap2)."""
        emb_a, map_a = self.encode(x1)
        emb_b, map_b = self.encode(x2)
        return emb_a, emb_b, map_a, map_b
448
+
449
+
450
class SiamesePatchComparator:
    """Stage 4 — per-pair patch comparison.

    Combines three signals into one change score (pixel diff 0.30,
    SSIM cost 0.40, embedding distance 0.30) and produces a Grad-CAM
    map locating WHERE inside the patch the change occurred.
    """

    def __init__(
        self,
        device: Optional[torch.device] = None,
        encoder: Optional[_SiameseEncoder] = None,  # ← shared encoder
    ):
        # Device preference: CUDA → Apple MPS → CPU.
        if device is None:
            if torch.cuda.is_available():
                device = torch.device("cuda")
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                device = torch.device("mps")
            else:
                device = torch.device("cpu")
        self.device = device
        # Reuse the encoder from SemanticRetrievalMatcher if provided —
        # avoids loading ResNet50 weights a second time.
        if encoder is not None:
            self.model = encoder
            logger.info("SiamesePatchComparator: reusing shared encoder on %s", device)
        else:
            self.model = _SiameseEncoder().to(device).eval()
            logger.info("SiamesePatchComparator: created new encoder on %s", device)
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def _to_tensor(self, patch_rgb: np.ndarray) -> torch.Tensor:
        """RGB uint8 patch → normalised (1, 3, 224, 224) tensor on self.device."""
        return self.transform(Image.fromarray(patch_rgb)).unsqueeze(0).to(self.device)

    def _grad_cam(
        self,
        patch_old: np.ndarray,
        patch_new: np.ndarray,
        target_hw: Tuple[int, int],
    ) -> np.ndarray:
        """
        Grad-CAM spatial change map — WHERE inside the patch the embedding differs.

        Method
        ------
        1. Forward patch_old (no grad) → embedding e_old.
        2. Forward patch_new (with grad, hooks on last conv block) → embedding
           e_new + feature map F captured by forward hook.
        3. Scalar loss = pairwise_distance(e_old.detach(), e_new).
        4. loss.backward() → dloss/dF captured by backward hook.
        5. Grad-CAM = ReLU( mean_c(dloss/dF) * F ) → (7x7) → upsample to patch size.

        Pixels with HIGH activation changed the embedding the most → the actual edits.

        Returns
        -------
        np.ndarray shape (target_hw[0], target_hw[1]), float32, values in [0, 1].
        """
        t_old = self._to_tensor(patch_old)
        t_new = self._to_tensor(patch_new)

        feat_store: Dict[str, torch.Tensor] = {}
        grad_store: Dict[str, torch.Tensor] = {}

        # Hook on the last convolutional block of the shared ResNet50.
        last_block = self.model.features[-1]

        def _fwd(module, inp, out):
            feat_store["f"] = out  # (1, 2048, 7, 7)

        def _bwd(module, grad_in, grad_out):
            grad_store["g"] = grad_out[0]  # (1, 2048, 7, 7)

        h_fwd = last_block.register_forward_hook(_fwd)
        h_bwd = last_block.register_full_backward_hook(_bwd)

        try:
            # e_old — no gradient needed, just a reference point.
            with torch.no_grad():
                e_old, _ = self.model.encode(t_old)

            # e_new — gradient flows through this path only.
            with torch.enable_grad():
                self.model.zero_grad()
                e_new, _ = self.model.encode(t_new)
                dist = F.pairwise_distance(e_old.detach(), e_new)
                dist.backward()
        finally:
            # Always detach the hooks, even if the backward pass raises.
            h_fwd.remove()
            h_bwd.remove()

        if "f" not in feat_store or "g" not in grad_store:
            return np.zeros(target_hw, dtype=np.float32)

        # Grad-CAM: global-average-pool the gradients, weight the feature maps.
        weights = grad_store["g"].mean(dim=[2, 3], keepdim=True)  # (1,2048,1,1)
        cam = (weights * feat_store["f"]).sum(dim=1).squeeze()  # (7, 7)
        cam = F.relu(cam)

        cam_max = cam.max()
        if cam_max < 1e-8:
            # Embeddings identical (or numerically so): no change to localise.
            return np.zeros(target_hw, dtype=np.float32)

        cam = (cam / cam_max).detach().cpu().numpy()  # (7, 7) in [0, 1]

        # Upsample to the original patch resolution.
        h, w = target_hw
        cam_up = cv2.resize(cam, (w, h), interpolation=cv2.INTER_LINEAR)
        return np.clip(cam_up, 0.0, 1.0).astype(np.float32)

    def compare(self, patch_old: np.ndarray, patch_new: np.ndarray) -> Dict[str, object]:
        """Compare two same-sized RGB patches.

        Returns a dict with keys "pixel_diff", "ssim_score", "semantic_diff",
        "total_change" (all floats) and "grad_cam" (float32 map in [0, 1]
        sized like patch_new).
        """
        g_old = cv2.cvtColor(patch_old, cv2.COLOR_RGB2GRAY).astype(np.float32)
        g_new = cv2.cvtColor(patch_new, cv2.COLOR_RGB2GRAY).astype(np.float32)
        diff_map = np.abs(g_old - g_new)
        # Threshold of 8 (was 15) — CAD drawings have fine lines and small
        # text; a dimension change may shift only a handful of pixels slightly.
        changed_pixels = np.sum(diff_map > 8.0)
        pixel_diff = float(changed_pixels) / max(g_old.size, 1)
        ssim_val = float(ssim(g_old, g_new, data_range=255.0))
        ssim_cost = max(0.0, 1.0 - ssim_val)
        with torch.no_grad():
            t1 = self._to_tensor(patch_old)
            t2 = self._to_tensor(patch_new)
            e1, e2, _, _ = self.model(t1, t2)
            l2_dist = float(F.pairwise_distance(e1, e2).item())
        # Embedding L2 distance squashed to [0, 1] (10.0 = saturation point).
        semantic_diff = min(l2_dist / 10.0, 1.0)
        total = 0.30 * pixel_diff + 0.40 * ssim_cost + 0.30 * semantic_diff

        # Grad-CAM: spatial map showing WHERE inside this patch the change is.
        h, w = patch_new.shape[:2]
        grad_cam_map = self._grad_cam(patch_old, patch_new, (h, w))

        return {
            "pixel_diff": pixel_diff,
            "ssim_score": ssim_val,
            "semantic_diff": semantic_diff,
            "total_change": min(float(total), 1.0),
            "grad_cam": grad_cam_map,  # (h, w) float32 [0,1]
        }

    def compare_pair(self, pair: MatchedPair, img_old: np.ndarray, img_new: np.ndarray) -> MatchedPair:
        """Crop both patches of a matched pair, pad them to a common size,
        run compare(), and write the metrics + Grad-CAM heatmap back onto
        the pair (mutated in place and returned)."""
        ro, rn = pair.region_old, pair.region_new
        patch_old = img_old[ro.y: ro.y + ro.h, ro.x: ro.x + ro.w]
        patch_new = img_new[rn.y: rn.y + rn.h, rn.x: rn.x + rn.w]
        if patch_old.size == 0 or patch_new.size == 0:
            # Degenerate crop (region outside the image): leave defaults.
            return pair
        target_h = max(patch_old.shape[0], patch_new.shape[0])
        target_w = max(patch_old.shape[1], patch_new.shape[1])

        def _pad_white(patch: np.ndarray, th: int, tw: int) -> np.ndarray:
            # Paste onto a white canvas so both patches share one size.
            canvas = np.full((th, tw, patch.shape[2]), 255, dtype=np.uint8)
            canvas[:patch.shape[0], :patch.shape[1]] = patch
            return canvas

        patch_old_p = _pad_white(patch_old, target_h, target_w)
        patch_new_p = _pad_white(patch_new, target_h, target_w)
        metrics = self.compare(patch_old_p, patch_new_p)
        pair.pixel_diff = metrics["pixel_diff"]
        pair.ssim_score = metrics["ssim_score"]
        pair.semantic_diff = metrics["semantic_diff"]
        pair.total_change = metrics["total_change"]
        # Store Grad-CAM map (sized to the new patch, not the padded version).
        raw_cam = metrics.get("grad_cam")
        if raw_cam is not None:
            rn = pair.region_new
            pair.heatmap = cv2.resize(raw_cam, (rn.w, rn.h),
                                      interpolation=cv2.INTER_LINEAR)
        return pair
615
+
616
+
617
+ # ══════════════════════════════════════════════════════════════════════
618
+ # HEATMAP GENERATOR
619
+ # ══════════════════════════════════════════════════════════════════════
620
+
621
class HeatmapGenerator:
    """Render per-region change intensities into a 4-channel float overlay.

    Channel 0 = moderate change (yellow), channel 1 = major change (red);
    channels 2 (added) and 3 (deleted) are reserved but left empty — those
    regions appear in the Match Canvas thermal view instead.
    """

    _COLOUR_CHANGED = np.array([255, 220, 0], dtype=np.float32)
    _COLOUR_MAJOR = np.array([230, 30, 30], dtype=np.float32)
    _COLOUR_ADDED = np.array([ 30, 200, 60], dtype=np.float32)
    _COLOUR_DELETED = np.array([200, 30, 200], dtype=np.float32)

    @staticmethod
    def _project_region(r: Region, M_inv: Optional[np.ndarray], w: int, h: int) -> Tuple[int, int, int, int]:
        """Project a region's corners through the inverse transform (if any)
        and return the clipped axis-aligned (x0, y0, x1, y1)."""
        if M_inv is None:
            return r.x, r.y, r.x + r.w, r.y + r.h
        pts = np.array([
            [r.x,       r.y      ],
            [r.x + r.w, r.y      ],
            [r.x,       r.y + r.h],
            [r.x + r.w, r.y + r.h],
        ], dtype=np.float32)
        homog = np.hstack([pts, np.ones((4, 1), dtype=np.float32)])
        mapped = (M_inv @ homog.T).T
        xs, ys = mapped[:, 0], mapped[:, 1]
        return (
            int(np.clip(xs.min(), 0, w - 1)),
            int(np.clip(ys.min(), 0, h - 1)),
            int(np.clip(xs.max(), 0, w - 1)),
            int(np.clip(ys.max(), 0, h - 1)),
        )

    @staticmethod
    def generate(
        img_shape: Tuple[int, int],
        matched_pairs: List[MatchedPair],
        unmatched_old: List[Region],
        unmatched_new: List[Region],
        smooth_kernel: int = 11,
        M_inv: Optional[np.ndarray] = None,
        change_threshold: float = 0.05,
    ) -> np.ndarray:
        """Build the (h, w, 4) float32 intensity layers for the Heatmap tab.

        Only modified matched pairs are painted: with the Grad-CAM map when
        available (brighter = more changed), otherwise a flat bounding box.
        """
        height, width = img_shape
        overlay = np.zeros((height, width, 4), dtype=np.float32)
        for pair in matched_pairs:
            strength = float(pair.total_change)
            if strength <= change_threshold:
                continue
            box = pair.region_new
            channel = 1 if strength > 0.40 else 0  # red for major, yellow otherwise
            window = overlay[box.y:box.y + box.h, box.x:box.x + box.w, channel]
            if pair.heatmap is None:
                # No Grad-CAM available: flood the whole bounding box.
                np.maximum(window, strength, out=window)
            else:
                # Grad-CAM path: paint only the pixels that actually changed,
                # scaled by the overall change so brighter = more changed.
                cam = pair.heatmap
                if cam.shape != (box.h, box.w):
                    cam = cv2.resize(cam, (box.w, box.h),
                                     interpolation=cv2.INTER_LINEAR)
                np.maximum(window, np.clip(cam * strength, 0.0, 1.0), out=window)
        # Channels 2 (added/green) and 3 (deleted/purple) intentionally omitted:
        # the Heatmap tab shows only modification intensity.
        if smooth_kernel > 0:
            ksize = smooth_kernel | 1  # force an odd kernel size
            for channel in range(4):
                if overlay[:, :, channel].max() > 0:
                    overlay[:, :, channel] = cv2.GaussianBlur(
                        overlay[:, :, channel], (ksize, ksize), sigmaX=3.0)
        for channel in range(2):
            if overlay[:, :, channel].max() > 0:
                # Gamma lift so faint changes stay visible after blurring.
                overlay[:, :, channel] = np.power(overlay[:, :, channel], 0.6)
        return overlay
692
+
693
+
694
+ # ══════════════════════════════════════════════════════════════════════
695
+ # VISUALISER
696
+ # ══════════════════════════════════════════════════════════════════════
697
+
698
class Visualiser:
    """Rendering helpers for the comparison UI."""

    # Fixed RGB palette keyed by region category / change status.
    COLOURS: Dict[str, Tuple[int, int, int]] = {
        "text_block": (30, 144, 255),
        "figure": (255, 165, 0),
        "table": (50, 205, 50),
        "unknown": (180, 180, 180),
        "deleted": (220, 50, 50),
        "added": (50, 220, 80),
        "changed": (255, 200, 0),
        "unchanged": (80, 220, 80),
    }

    @staticmethod
    def draw_alignment_check(
        img_old_aligned: np.ndarray,
        img_new: np.ndarray,
    ) -> np.ndarray:
        """
        Build the red-cyan overlay for the Alignment Check tab.

        The aligned OLD page feeds the red channel; the NEW page feeds both
        the green and blue channels (cyan).  Reading the result:

        β€’ Ink present at the same pixel in both pages β†’ gray (R β‰ˆ G β‰ˆ B)
        β€’ Ink only in OLD (drifted)                   β†’ RED fringe
        β€’ Ink only in NEW (drifted)                   β†’ CYAN fringe
        β€’ White background in both                    β†’ white

        A mostly gray/white overlay therefore means alignment is good;
        red/cyan fringes indicate residual misalignment.
        """
        gray_prev = cv2.cvtColor(img_old_aligned, cv2.COLOR_RGB2GRAY)
        gray_curr = cv2.cvtColor(img_new, cv2.COLOR_RGB2GRAY)
        # R = old, G = B = new  β†’  red vs cyan fringes where they disagree
        return np.dstack([gray_prev, gray_curr, gray_curr])
735
+
736
+
737
+ # ══════════════════════════════════════════════════════════════════════
738
+ # HELPER β€” unmatched region visual-change check
739
+ # ══════════════════════════════════════════════════════════════════════
740
+
741
# Mean-abs pixel diff below this threshold β†’ region is visually identical
# despite not being paired by the matcher; excluded from the change score.
# NOTE(review): value appears hand-tuned for the 400-DPI render path β€” confirm
# before reusing at other DPIs.
_UNMATCHED_PIXEL_THR: float = 12.0  # on 0–255 grayscale scale
744
+
745
+
746
def _region_mean_diff(
    r: Region,
    img_a: np.ndarray,
    candidates: List[Region],
    img_b: np.ndarray,
    thumb: int = 64,
) -> float:
    """
    Minimum mean-absolute grayscale difference (0–255 scale) between region
    `r` of `img_a` and the spatially nearest candidate regions of `img_b`.

    "Spatially nearest" means smallest Euclidean centre-to-centre distance;
    only the three closest candidates are examined, for speed.  When there
    are no candidates (or a crop is empty), 255.0 β€” maximally different β€”
    is returned.
    """
    if not candidates:
        return 255.0

    patch_a = img_a[r.y: r.y + r.h, r.x: r.x + r.w]
    if patch_a.size == 0:
        return 255.0

    def _thumbnail(patch: np.ndarray) -> np.ndarray:
        # Grayscale + fixed-size thumbnail makes patches directly comparable.
        gray = cv2.cvtColor(patch, cv2.COLOR_RGB2GRAY)
        return cv2.resize(gray, (thumb, thumb)).astype(np.float32)

    ga = _thumbnail(patch_a)

    cx, cy = r.center
    nearest = sorted(
        candidates,
        key=lambda c: (c.center[0] - cx) ** 2 + (c.center[1] - cy) ** 2,
    )[:3]

    best = 255.0
    for cand in nearest:
        patch_b = img_b[cand.y: cand.y + cand.h, cand.x: cand.x + cand.w]
        if patch_b.size == 0:
            continue
        best = min(best, float(np.mean(np.abs(ga - _thumbnail(patch_b)))))
    return best
786
+
787
+
788
def _is_truly_changed(
    r: Region,
    candidates: List[Region],
    img_a: np.ndarray,
    img_b: np.ndarray,
) -> bool:
    """
    True when region `r` (from img_a) looks visually *different* from its
    nearest spatial counterpart among `candidates` (from img_b).

    This separates "the matcher failed to pair two identical regions" from
    "content was genuinely added or deleted".
    """
    diff = _region_mean_diff(r, img_a, candidates, img_b)
    return diff >= _UNMATCHED_PIXEL_THR
802
+
803
+
804
+ # ══════════════════════════════════════════════════════════════════════
805
+ # MAIN PIPELINE
806
+ # ══════════════════════════════════════════════════════════════════════
807
+
808
class CoarseToFinePipeline:
    """
    End-to-end page comparison: align β†’ extract regions β†’ match β†’ Siamese
    fine-compare β†’ heatmap + page change score.

    Stages
    ------
    1. GlobalAligner (optional) warps the OLD page onto the NEW page.
    2. LayoutRegionExtractor detects layout regions on both pages.
    3. A matcher pairs OLD/NEW regions (SemanticRetrievalMatcher by default).
    4. SiamesePatchComparator scores each matched pair in detail.
    """

    def __init__(
        self,
        align: bool = True,
        device: Optional[torch.device] = None,
        region_extractor: Optional[LayoutRegionExtractor] = None,
        matcher=None,  # SemanticRetrievalMatcher or HungarianRegionMatcher
        comparator: Optional[SiamesePatchComparator] = None,
        min_similarity: float = 0.50,  # used only when matcher=None (auto-build)
    ):
        """
        Parameters
        ----------
        align : run global (ORB-based) alignment before comparing when True.
        device : torch device; auto-detected (cuda β†’ mps β†’ cpu) when None.
        region_extractor : custom extractor; defaults to LayoutRegionExtractor().
        matcher : custom matcher used as-is; when None a SemanticRetrievalMatcher
            is built that shares one ResNet50 encoder with the comparator.
        comparator : custom comparator; defaults to SiamesePatchComparator.
        min_similarity : match-acceptance threshold for the auto-built matcher.
        """
        # Resolve device once here so both sub-modules share it
        if device is None:
            if torch.cuda.is_available():
                device = torch.device("cuda")
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                device = torch.device("mps")
            else:
                device = torch.device("cpu")
        self._device = device

        self.aligner = GlobalAligner() if align else None
        self.extractor = region_extractor or LayoutRegionExtractor()

        if matcher is not None:
            # Caller supplied a custom matcher β€” use it as-is
            self.matcher = matcher
            self.comparator = comparator or SiamesePatchComparator(device=device)
        else:
            # ── Default path: shared ResNet50 encoder ──────────────
            # Build the encoder once; hand the same object to both
            # SemanticRetrievalMatcher (Stage 3) and SiamesePatchComparator (Stage 4).
            # This halves model-load time and GPU/CPU RAM usage.
            shared_encoder = _SiameseEncoder().to(device).eval()
            logger.info("Pipeline: shared ResNet50 encoder on %s", device)

            self.matcher = SemanticRetrievalMatcher(
                encoder        = shared_encoder,
                device         = device,
                min_similarity = min_similarity,
            )
            self.comparator = comparator or SiamesePatchComparator(
                device  = device,
                encoder = shared_encoder,  # ← reuse, no second load
            )

    def compare(self, img_old: np.ndarray, img_new: np.ndarray, verbose: bool = True) -> ComparisonResult:
        """
        Compare two RGB page images and return a ComparisonResult.

        Parameters
        ----------
        img_old, img_new : RGB numpy arrays.  Assumed to be the same size β€”
            the caller (run_comparison) resizes OLD to NEW beforehand;
            TODO confirm for other call sites.
        verbose : log per-stage timings when True.
        """
        timings: Dict[str, float] = {}
        t = time.time()
        M = None
        if self.aligner is not None:
            img_old_aligned, M = self.aligner.align(img_old, img_new)
        else:
            img_old_aligned = img_old.copy()
        timings["alignment"] = time.time() - t

        t = time.time()
        regions_old = self.extractor.extract(img_old_aligned)
        regions_new = self.extractor.extract(img_new)
        timings["extraction"] = time.time() - t

        t = time.time()
        matched, unmatched_old, unmatched_new = self.matcher.match(
            regions_old, regions_new, img_old_aligned, img_new)
        timings["matching"] = time.time() - t

        t = time.time()
        # compare_pair returns an updated pair (with change scores) β€” write back in place
        for i, pair in enumerate(matched):
            matched[i] = self.comparator.compare_pair(pair, img_old_aligned, img_new)
        timings["siamese"] = time.time() - t

        if verbose:
            logger.info("Timings β†’ align: %.2fs | extract: %.2fs | match: %.2fs | siamese: %.2fs",
                        timings["alignment"], timings["extraction"],
                        timings["matching"], timings["siamese"])

        h, w = img_new.shape[:2]
        # After the ORB fix, M maps OLD→NEW (forward).
        # _project_region uses this matrix to map unmatched OLD region corners
        # into NEW-page coordinates for heatmap rendering β€” so pass M directly,
        # NOT its inverse. (Previously M mapped NEW→OLD so the inverse was
        # needed; now the roles are corrected.)
        heatmap = HeatmapGenerator.generate(
            (h, w), matched, unmatched_old, unmatched_new,
            M_inv=M, change_threshold=0.05,
        )
        # ── Change % calculation (two-part fix) ────────────────────────
        #
        # Part A β€” pixel-diff gate on unmatched regions
        #   Unmatched regions are NOT automatically "added/deleted".
        #   They may simply be regions the matcher failed to pair even though
        #   the content is identical. We compare each unmatched region to its
        #   nearest spatial counterpart in the opposite list; only those whose
        #   pixel diff exceeds _UNMATCHED_PIXEL_THR are counted as truly changed.
        #
        # Part B β€” normalise against full page area (not just detected regions)
        #   Using content_area as denominator collapses to 100% when all regions
        #   are unmatched. Using h*w gives a stable baseline independent of
        #   how many regions were detected or matched.

        truly_deleted = [
            r for r in unmatched_old
            if _is_truly_changed(r, unmatched_new, img_old_aligned, img_new)
        ]
        truly_added = [
            r for r in unmatched_new
            if _is_truly_changed(r, unmatched_old, img_new, img_old_aligned)
        ]

        page_area = max(h * w, 1)  # Part B denominator
        changed_area = sum(p.region_new.area for p in matched if p.total_change > 0.05)
        deleted_area = sum(r.area for r in truly_deleted)
        added_area = sum(r.area for r in truly_added)
        total_pct = min(100.0 * (changed_area + added_area + deleted_area) / page_area, 100.0)

        return ComparisonResult(
            matched_pairs=matched,
            unmatched_old=unmatched_old,
            unmatched_new=unmatched_new,
            global_transform=M,
            total_change_pct=total_pct,
            heatmap=heatmap,
            img_old_aligned=img_old_aligned,  # ← stored for thermal overlay
        )
931
+
932
+
933
+ # ══════════════════════════════════════════════════════════════════════
934
+ # GRADIO APP β€” HELPERS
935
+ # ══════════════════════════════════════════════════════════════════════
936
+
937
+ def _pick_device() -> torch.device:
938
+ if torch.cuda.is_available():
939
+ return torch.device("cuda")
940
+ if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
941
+ return torch.device("mps")
942
+ return torch.device("cpu")
943
+
944
+
945
def _page_to_rgb(doc: fitz.Document, idx: int, dpi: int) -> np.ndarray:
    """Rasterise page `idx` of `doc` at `dpi` and return an (H, W, 3) uint8 RGB array."""
    pixmap = doc[idx].get_pixmap(dpi=dpi)
    flat = np.frombuffer(pixmap.samples, np.uint8)
    return flat.reshape(pixmap.height, pixmap.width, 3)
948
+
949
+
950
+ def _build_summary(
951
+ page_results: list,
952
+ aligned: bool,
953
+ skip_old_p1: bool = False,
954
+ skip_new_p1: bool = False,
955
+ ) -> str:
956
+ total_changes = [pr["total_change_pct"] for pr in page_results]
957
+
958
+ lines = [
959
+ "╔══════════════════════════════════════════════════════════╗",
960
+ "β•‘ POWERGRID DOCUMENT AUDIT β€” CHANGE REPORT β•‘",
961
+ "β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•",
962
+ "",
963
+ f" Total Pages Analysed : {len(page_results)}",
964
+ f" Overall Avg Change : {np.mean(total_changes):.2f}%",
965
+ "",
966
+ "──────────────────────────────────────────────────────────",
967
+ " PAGE-WISE CHANGE SUMMARY",
968
+ "──────────────────────────────────────────────────────────",
969
+ ]
970
+
971
+ for pr in page_results:
972
+ pct = pr["total_change_pct"]
973
+ status = "βœ… MINIMAL" if pct < 5 else "⚠️ MODERATE" if pct < 20 else "πŸ”΄ SIGNIFICANT"
974
+ lines.append(f" Page {pr['page']:>3} β”‚ {pct:>5.1f}% β”‚ {status}")
975
+
976
+ significant = [pr["page"] for pr in page_results if pr["total_change_pct"] > 20]
977
+ if significant:
978
+ lines += [
979
+ "",
980
+ f" ⚠️ Pages with significant changes (>20%): {significant}",
981
+ ]
982
+
983
+ return "\n".join(lines)
984
+
985
+
986
def _build_output_pdf(page_results: list, output_path: str,
                      process_dpi: int = 400) -> str:
    """
    Write the per-page overlay images into a PDF at full pixel depth.

    PyMuPDF page sizes are in points (1 pt = 1/72 inch) while the overlay
    images were rendered at `process_dpi`.  Sizing each page to
    pixels * 72 / process_dpi makes insert_image() map the image 1:1 onto
    the page rect, so no resampling occurs and every pixel is preserved.
    PNG encoding keeps the embed lossless (no JPEG ringing).

    Returns `output_path`.
    """
    pts_per_px = 72.0 / process_dpi  # point size of one image pixel
    doc_out = fitz.open()
    for entry in page_results:
        overlay = entry["align_check"].convert("RGB")
        width_px, height_px = overlay.size
        page = doc_out.new_page(width=width_px * pts_per_px,
                                height=height_px * pts_per_px)
        buf = io.BytesIO()
        overlay.save(buf, format="PNG", optimize=True)
        page.insert_image(page.rect, stream=buf.getvalue())
    doc_out.save(output_path, deflate=True, garbage=4, clean=True)
    doc_out.close()
    return output_path
1015
+
1016
+
1017
+ # ══════════════════════════════════════════════════════════════════════
1018
+ # SPECIFIC-REGION HELPER β€” semantic global search in OLD document
1019
+ # ══════════════════════════════════════════════════════════════════════
1020
+
1021
# ImageNet normalisation reused from SemanticRetrievalMatcher: 224Γ—224 resize,
# tensor conversion, then the standard ImageNet channel statistics.  Must stay
# in sync with the matcher's preprocessing so embeddings are comparable.
_REGION_TRANSFORM = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
1028
+
1029
+
1030
def _embed_patch(patch_rgb: np.ndarray,
                 encoder: "_SiameseEncoder",
                 device: torch.device) -> torch.Tensor:
    """Encode one RGB numpy patch into a (128,) L2-normalised embedding."""
    pil_patch = Image.fromarray(patch_rgb)
    batch = _REGION_TRANSFORM(pil_patch).unsqueeze(0).to(device)  # (1, 3, 224, 224)
    with torch.no_grad():
        embeddings, _ = encoder.encode(batch)  # (1, 128)
    return embeddings[0]
1038
+
1039
+
1040
def _find_matching_region_in_old(
    new_crop: np.ndarray,
    img_old_full: np.ndarray,
    encoder: "_SiameseEncoder",
    device: torch.device,
) -> Tuple[int, int, int, int]:
    """
    Locate where new_crop (user-selected patch from NEW page) sits inside
    img_old_full (the complete OLD page).

    Method β€” Semantic sliding-window search
    ----------------------------------------
    1. Encode new_crop with the shared ResNet50 encoder β†’ 128-d embedding.
    2. Slide a window across img_old_full at multiple scales (Β±30 % of the
       crop size). Step = 50 % of window size so adjacent windows overlap
       and the true location is never missed.
    3. Encode every window patch and compute cosine similarity with the
       query embedding. Pick the window with the highest similarity.
    4. Clamp the winning box to page bounds and return it.

    Why semantic (not pixel-level):
    β€’ ResNet50 encodes *what* is in a region (shapes, structure, symbols),
      not pixel values. Two revisions of the same table/panel/diagram will
      have near-identical embeddings even if text values changed slightly.
    β€’ Scale-invariant: the multi-scale sweep handles content that was
      enlarged or shrunk between revisions.
    β€’ Position-invariant: the full-page sweep finds content anywhere on the
      OLD page regardless of how far it moved.

    Returns (x1, y1, x2, y2) in img_old_full pixel space.

    NOTE(review): there is no minimum-similarity floor β€” the best window is
    returned even when its cosine score is poor; confirm that is acceptable.
    """
    crop_h, crop_w = new_crop.shape[:2]
    old_h, old_w = img_old_full.shape[:2]

    def _clamp_box(bx: int, by: int, bw: int, bh: int
                   ) -> Tuple[int, int, int, int]:
        # Clamp the origin into the page, then clamp the size to what remains.
        bx = max(0, min(bx, old_w - 1))
        by = max(0, min(by, old_h - 1))
        bw = max(1, min(bw, old_w - bx))
        bh = max(1, min(bh, old_h - by))
        return bx, by, bx + bw, by + bh

    # ── Step 1: encode the query (NEW crop) ──────────────────────────
    q_emb = _embed_patch(new_crop, encoder, device)  # (128,)

    # ── Step 2: build candidate windows across scales ────────────────
    # Scales relative to the crop's own size. For a 400-DPI page a crop
    # that is, say, 600 px wide is tested at 420 … 780 px widths.
    scales = (0.70, 0.85, 1.00, 1.15, 1.30)
    aspect = crop_h / max(crop_w, 1)  # NOTE(review): computed but never used below

    candidates: List[Tuple[int, int, int, int]] = []  # (x, y, w, h)

    for sc in scales:
        win_w = max(32, int(crop_w * sc))
        win_h = max(32, int(crop_h * sc))
        if win_w > old_w or win_h > old_h:
            continue
        # 50 % overlap between neighbouring windows
        step_x = max(1, win_w // 2)
        step_y = max(1, win_h // 2)
        for y in range(0, old_h - win_h + 1, step_y):
            for x in range(0, old_w - win_w + 1, step_x):
                candidates.append((x, y, win_w, win_h))

    logger.info(
        "_find_matching_region_in_old: %d candidate windows across %d scales",
        len(candidates), len(scales),
    )

    if not candidates:
        # Entire crop is bigger than the old page β€” return full page
        logger.warning("_find_matching_region_in_old: crop >= page; returning full page box.")
        return _clamp_box(0, 0, old_w, old_h)

    # ── Step 3: batch-encode all windows, find best cosine similarity ─
    # Process in mini-batches of 64 to avoid OOM on large pages.
    BATCH = 64
    best_sim: float = -1.0
    best_box: Tuple[int, int, int, int] = candidates[0]

    for start in range(0, len(candidates), BATCH):
        batch_cands = candidates[start: start + BATCH]
        patches = []
        for (cx, cy, cw, ch) in batch_cands:
            patch = img_old_full[cy: cy + ch, cx: cx + cw]
            patches.append(patch)

        tensors = [
            _REGION_TRANSFORM(Image.fromarray(p)) for p in patches
        ]
        batch_t = torch.stack(tensors).to(device)  # (B, 3, 224, 224)
        with torch.no_grad():
            embs, _ = encoder.encode(batch_t)  # (B, 128)

        # Cosine similarity: q_emb is already L2-normed, embs are L2-normed
        sims = (embs @ q_emb).cpu().numpy()  # (B,)

        idx = int(sims.argmax())
        if sims[idx] > best_sim:
            best_sim = float(sims[idx])
            best_box = batch_cands[idx]

    bx, by, bw, bh = best_box
    x1o, y1o, x2o, y2o = _clamp_box(bx, by, bw, bh)

    logger.info(
        "_find_matching_region_in_old: best cosine=%.4f OLD box (%d,%d)–(%d,%d)",
        best_sim, x1o, y1o, x2o, y2o,
    )
    return (x1o, y1o, x2o, y2o)
1150
+
1151
+
1152
+ # ══════════════════════════════════════════════════════════════════════
1153
+ # CORE PROCESSING
1154
+ # ══════════════════════════════════════════════════════════════════════
1155
+
1156
def run_comparison(
    pdf_old_file,
    pdf_new_file,
    skip_old_p1: bool,
    skip_new_p1: bool,
    enable_align: bool,
    compare_mode: str,
    page_old_input: int,
    page_new_input: int,
    page_compare_mode: str = "Full Page",
    region_coords=None,
    display_dpi: int = 72,
    progress=gr.Progress(),
):
    """
    Gradio callback: compare two uploaded PDF revisions page by page.

    Parameters
    ----------
    pdf_old_file, pdf_new_file : gr.File values; only `.name` (temp-file path)
        is used.  Missing uploads raise gr.Error.
    skip_old_p1, skip_new_p1 : skip each document's cover page
        (Full Document mode only).
    enable_align : run ORB auto-alignment inside the pipeline.
    compare_mode : "Full Document" or "Specific Pages".
    page_old_input, page_new_input : 1-based page numbers (Specific Pages mode).
    page_compare_mode : "Full Page" or "Specific Region" (Specific Pages only).
    region_coords : {x, y, width, height} of the box drawn on the NEW-doc
        preview, in preview pixels; None β†’ full page.
    display_dpi : DPI the region preview was rendered at; used to scale the
        drawn box up to process DPI.
    progress : Gradio progress tracker (injected by the framework).

    Returns
    -------
    (page_results, summary_text, output_pdf_path, 1,
     gr.update(maximum=num_pairs, value=1))
    """
    dpi = 400  # process DPI β€” higher = more pixel depth in overlay output

    if pdf_old_file is None or pdf_new_file is None:
        raise gr.Error("Please upload both Previous Revision and New Document PDF files.")

    device = _pick_device()

    pipeline = CoarseToFinePipeline(
        align          = enable_align,
        device         = device,
        min_similarity = 0.50,
    )

    progress(0, desc="Opening PDF files …")
    doc_old = fitz.open(pdf_old_file.name)
    doc_new = fitz.open(pdf_new_file.name)

    # ── Build the list of (old_page_idx, new_page_idx) pairs to process ──
    if compare_mode == "Specific Pages":
        # Convert 1-based user input to 0-based index
        old_idx_req = int(page_old_input or 1) - 1
        new_idx_req = int(page_new_input or 1) - 1
        # Clamp to valid range
        old_idx_req = max(0, min(old_idx_req, len(doc_old) - 1))
        new_idx_req = max(0, min(new_idx_req, len(doc_new) - 1))
        page_pairs = [(old_idx_req, new_idx_req)]
    else:
        # Full document mode
        old_start = 1 if skip_old_p1 else 0
        new_start = 1 if skip_new_p1 else 0
        old_pages = len(doc_old) - old_start
        new_pages = len(doc_new) - new_start
        num_pages = min(old_pages, new_pages)

        if skip_old_p1:
            gr.Info("Skipping cover page of Previous Revision.")
        if skip_new_p1:
            gr.Info("Skipping cover page of New Document.")
        if old_pages != new_pages:
            gr.Warning(
                f"Page count mismatch: Previous Revision={old_pages}, New Document={new_pages}. "
                f"Processing {num_pages} pages."
            )
        page_pairs = [(pg + old_start, pg + new_start) for pg in range(num_pages)]

    num_pairs = len(page_pairs)
    page_results = []

    for i, (old_idx, new_idx) in enumerate(page_pairs):
        progress(i / num_pairs, desc=f"Processing page {i + 1} / {num_pairs} …")
        img_old = _page_to_rgb(doc_old, old_idx, dpi)
        img_new = _page_to_rgb(doc_new, new_idx, dpi)

        # ── Normalise page dimensions before any cropping ─────────────
        # Both pages must have the same native DPI dimensions so that the
        # same pixel box selects the same physical region in both docs.
        if img_old.shape != img_new.shape:
            img_old = cv2.resize(img_old, (img_new.shape[1], img_new.shape[0]))

        # ── Specific-region crop ──────────────────────────────────────
        # The user drew a box on the NEW-doc preview (at display_dpi).
        # Steps:
        #   1. Scale the drag coordinates from preview pixels β†’ process DPI pixels.
        #   2. Crop the same pixel box from BOTH old and new pages.
        #      (Engineering drawings keep the same layout between revisions β€”
        #      same position = same physical area. The ORB aligner inside
        #      pipeline.compare() handles any sub-pixel drift between the two.)
        #   3. Replace img_old / img_new with the two crops β†’ overlay is
        #      scoped to only the selected region.
        if (compare_mode == "Specific Pages"
                and page_compare_mode == "Specific Region"
                and region_coords):
            rx = region_coords.get("x", 0)
            ry = region_coords.get("y", 0)
            rw = region_coords.get("width", img_new.shape[1])
            rh = region_coords.get("height", img_new.shape[0])
            sf = dpi / float(display_dpi or 72)  # preview px β†’ process DPI px
            x1 = max(0, int(rx * sf))
            y1 = max(0, int(ry * sf))
            x2 = min(img_new.shape[1], int((rx + rw) * sf))
            y2 = min(img_new.shape[0], int((ry + rh) * sf))

            logger.info(
                "Specific Region: display_dpi=%d sf=%.3f "
                "preview-box (%d,%d,%d,%d) β†’ process-px (%d,%d)–(%d,%d)",
                display_dpi, sf, rx, ry, rw, rh, x1, y1, x2, y2,
            )

            if x2 > x1 and y2 > y1:
                # Step 1 β€” crop the selected region from NEW page
                img_new_crop = img_new[y1:y2, x1:x2]

                # Step 2 β€” semantic global search: encode the NEW crop with
                #          ResNet50, slide windows over the FULL OLD page at
                #          multiple scales, pick the highest cosine-similarity
                #          window as the matching region in OLD.
                ox1, oy1, ox2, oy2 = _find_matching_region_in_old(
                    new_crop     = img_new_crop,
                    img_old_full = img_old,
                    encoder      = pipeline.matcher.encoder,
                    device       = device,
                )
                logger.info(
                    "Specific Region: NEW (%d,%d)–(%d,%d) β†’ OLD (%d,%d)–(%d,%d)",
                    x1, y1, x2, y2, ox1, oy1, ox2, oy2,
                )

                # Step 3 β€” crop OLD at found location; resize to exactly match
                #          NEW crop so pipeline.compare() gets equal-size inputs
                img_old_raw = img_old[oy1:oy2, ox1:ox2]
                nh, nw = img_new_crop.shape[:2]
                if img_old_raw.shape[:2] != (nh, nw):
                    img_old_crop = cv2.resize(
                        img_old_raw, (nw, nh), interpolation=cv2.INTER_LINEAR,
                    )
                else:
                    img_old_crop = img_old_raw

                # Step 4 β€” overlay is scoped to the selected region only
                img_old = img_old_crop
                img_new = img_new_crop

        result = pipeline.compare(img_old, img_new)

        old_aligned_for_check = (
            result.img_old_aligned if result.img_old_aligned is not None
            else img_old
        )
        align_check = Visualiser.draw_alignment_check(old_aligned_for_check, img_new)

        page_results.append({
            "page": i + 1,
            "result": result,
            "align_check": Image.fromarray(align_check),
            "original": Image.fromarray(img_old),
            "revised": Image.fromarray(img_new),
            "total_change_pct": result.total_change_pct,
        })

    doc_old.close()
    doc_new.close()

    progress(0.95, desc="Generating report PDF …")
    output_pdf = _build_output_pdf(page_results, "ctf_output.pdf", process_dpi=dpi)
    summary = _build_summary(page_results, enable_align, skip_old_p1, skip_new_p1)

    progress(1.0, desc="Done!")
    return page_results, summary, output_pdf, 1, gr.update(maximum=num_pairs, value=1)
1318
+
1319
+
1320
def get_page_view(page_num, pages_data, view_mode, rotation: int = 0,
                  nudge_x: int = 0, nudge_y: int = 0, nudge_scale: float = 1.0):
    """
    Fetch the PIL image for one result page in the requested view mode.

    The manual nudge (translate/scale of the red layer) applies only to the
    "Auto-Overlay" view; `rotation` (degrees) applies to every view.
    Returns None when there is no data or no image for the chosen view.
    """
    if not pages_data:
        return None

    index = max(0, min(int(page_num) - 1, len(pages_data) - 1))
    entry = pages_data[index]

    view_keys = {
        "Auto-Overlay": "align_check",
        "Previous Revision": "original",
        "New Document": "revised",
    }
    img = entry.get(view_keys.get(view_mode, "align_check"))
    if img is None:
        return None

    # Manual fine-tune is only meaningful on the overlay view
    scale = float(nudge_scale) if nudge_scale else 1.0
    has_nudge = nudge_x != 0 or nudge_y != 0 or abs(scale - 1.0) > 1e-4
    if view_mode == "Auto-Overlay" and has_nudge:
        img = _apply_nudge_overlay(entry, nudge_x, nudge_y, scale)

    if img is not None and rotation % 360 != 0:
        img = img.rotate(rotation, expand=True)
    return img
1344
+
1345
+
1346
def _apply_nudge_overlay(pr: dict, dx: int, dy: int, scale: float = 1.0) -> Image.Image:
    """
    Re-render the Auto-Overlay with the NEW (red) layer translated by
    (dx, dy) pixels and scaled by `scale` about the image centre.

    The cyan layer (Previous Revision, already aligned) stays fixed; only
    the New Document layer moves, letting the user manually cancel any
    residual misalignment.  Returns None when no overlay is stored.
    """
    if pr.get("align_check") is None:
        return None

    # Recover the two grayscale layers from the stored overlay:
    # channel 0 carried the aligned Previous Revision, channel 1 the New Doc.
    rgb = np.array(pr["align_check"].convert("RGB"))
    prev_gray = rgb[:, :, 0]
    new_gray = rgb[:, :, 1]

    h, w = prev_gray.shape
    cx, cy = w / 2.0, h / 2.0

    s = float(scale) if scale and scale > 0 else 1.0
    # Single 2Γ—3 affine = scale about (cx, cy) followed by translate (dx, dy):
    #   M = T(dx,dy) Β· T(cx,cy) Β· S(s) Β· T(-cx,-cy)
    affine = np.float32([
        [s, 0, dx + cx * (1 - s)],
        [0, s, dy + cy * (1 - s)],
    ])

    new_shifted = cv2.warpAffine(
        new_gray, affine, (w, h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=255,  # pad with white so borders don't read as ink
    )

    # R = fixed old layer, G = B = nudged new layer (β†’ red/cyan fringes)
    composite = np.dstack([prev_gray, new_shifted, new_shifted])
    return Image.fromarray(composite.astype(np.uint8))
1384
+
1385
+
1386
+ # ══════════════════════════════════════════════════════════════════════
1387
+ # GRADIO UI
1388
+ # ══════════════════════════════════════════════════════════════════════
1389
+
1390
# Load the external stylesheet that skins the Gradio UI.  styles.css must sit
# next to this file; a missing stylesheet is a deployment error, so the
# FileNotFoundError is deliberately allowed to surface at startup.
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "styles.css"),
          encoding="utf-8") as _css_f:
    _CSS = _css_f.read()

# UI theme: blue primary palette, gray neutrals, Inter typeface.
_THEME = gr.themes.Base(
    primary_hue=gr.themes.colors.blue,
    neutral_hue=gr.themes.colors.gray,
    font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
)
1399
+
1400
+ # Gradio 6+: theme & css are passed to launch(), not Blocks()
1401
+ with gr.Blocks(title="POWERGRID Document Auditor") as demo:
1402
+
1403
+ # ── Header ─────────────────────────────────────────────────────────
1404
+ _logo_tag = (
1405
+ f'<img src="{_LOGO_URI}" alt="POWERGRID Logo" />'
1406
+ if _LOGO_URI else
1407
+ '<span style="font-size:1.4rem;font-weight:900;color:#003087;letter-spacing:-1px;">PG</span>'
1408
+ )
1409
+ gr.HTML(f"""
1410
+ <div id="app-header">
1411
+ <div id="app-header-inner">
1412
+ <div id="app-header-logo">{_logo_tag}</div>
1413
+ <div id="app-header-text">
1414
+ <h1>POWERGRID Document Auditor</h1>
1415
+ <p>Power Grid Corporation of India Limited &nbsp;&mdash;&nbsp; AI-Powered Document Comparison</p>
1416
+ </div>
1417
+ </div>
1418
+ </div>
1419
+ """)
1420
+
1421
+ # (JS injected via demo.load below β€” see end of Blocks context)
1422
+
1423
+ # ── Shared State ───────────────────────────────────────────────────
1424
+ pages_state = gr.State(value=None)
1425
+ rotation_state = gr.State(value=0)
1426
+ nudge_x_state = gr.State(value=0) # manual X offset for red (New Doc) layer
1427
+ nudge_y_state = gr.State(value=0) # manual Y offset for red (New Doc) layer
1428
+ nudge_scale_state = gr.State(value=1.0) # manual scale for red (New Doc) layer
1429
+ region_coords_state = gr.State(value=None) # {x,y,width,height} in preview px; None = full page
1430
+ display_dpi_state = gr.State(value=72) # DPI used when rendering the region preview
1431
+
1432
+ # ── Layout ─────────────────────────────────────────────────────────
1433
+ with gr.Row(equal_height=False):
1434
+
1435
+ # ════════════════════════════════════════════════════════════
1436
+ # LEFT PANE β€” inputs
1437
+ # ════════════════════════════════════════════════════════════
1438
+ with gr.Column(scale=1, min_width=290, elem_id="left-panel"):
1439
+
1440
+ gr.HTML('<div class="section-label">Documents</div>')
1441
+ pdf_old = gr.File(label="Previous Revision PDF", file_types=[".pdf"])
1442
+ skip_old_p1 = gr.Checkbox(
1443
+ value=False,
1444
+ label="Skip cover page of Previous Revision",
1445
+ interactive=False,
1446
+ elem_classes=["skip-cb"],
1447
+ )
1448
+
1449
# ── Left panel (continued): New-PDF input, options, compare mode, nudge pad ──
# NOTE(review): this run of statements sits inside a left-panel column whose
# `with` opening is above this chunk — indentation reconstructed; verify depth.
gr.HTML('<div class="section-divider"></div>')
pdf_new = gr.File(label="Revised (New) PDF", file_types=[".pdf"])
skip_new_p1 = gr.Checkbox(
    value=False,
    label="Skip cover page of New Revision",
    interactive=False,  # enabled later by on_pdf_upload for multi-page PDFs
    elem_classes=["skip-cb"],
)

gr.HTML('<div class="section-divider"></div>')
gr.HTML('<div class="section-label">Options</div>')
enable_align = gr.Checkbox(
    value=True,
    label="Auto-align pages before comparing",
    info="Enable if documents were scanned or printed at different positions or scales.",
)

gr.HTML('<div class="section-divider"></div>')
gr.HTML('<div class="section-label">Compare Mode</div>')
compare_mode = gr.Radio(
    choices=["Full Document", "Specific Pages"],
    value="Full Document",
    label="Compare Mode",
    show_label=False,
    elem_id="compare-mode-radio",
)
# Page-number inputs, only shown in "Specific Pages" mode (see on_compare_mode_change)
with gr.Row(visible=False, elem_id="specific-pages-row") as specific_pages_row:
    page_old_input = gr.Number(
        value=1, minimum=1, step=1, precision=0,
        label="Prev. Revision Page",
        elem_id="page-old-input",
    )
    page_new_input = gr.Number(
        value=1, minimum=1, step=1, precision=0,
        label="New Document Page",
        elem_id="page-new-input",
    )

# Sub-options shown when "Specific Pages" is selected
with gr.Column(visible=False, elem_id="region-col") as region_col:
    page_compare_mode = gr.Radio(
        choices=["Full Page", "Specific Region"],
        value="Full Page",
        label="Page Comparison",
        show_label=True,
        elem_id="page-compare-mode-radio",
    )

# Region selection: gr.Image shows the page; a JS canvas overlay captures the bbox drag
with gr.Column(visible=False, elem_id="region-preview-col") as region_preview_col:
    region_readout = gr.HTML(
        value='<div id="region-readout">No region selected β€” full page will be used</div>',
        elem_id="region-readout",
    )
    # gr.Image: Python pushes the page PIL image here (always visible in DOM)
    region_page_img = gr.Image(
        value=None,
        label=None,
        show_label=False,
        type="pil",
        interactive=False,
        elem_id="region-page-img",
        height=380,
    )
    # Coords textbox: JS-to-Python bridge — visible but CSS-collapsed to 0px
    region_coords_txt = gr.Textbox(
        value="",
        label=None,
        show_label=False,
        elem_id="region-coords-txt",
        elem_classes=["region-coords-hidden"],
    )
    clear_region_btn = gr.Button(
        "βœ• Clear Region",
        size="sm",
        elem_id="clear-region-btn",
    )

gr.HTML('<div class="section-divider"></div>')
run_btn = gr.Button("Run Audit", variant="primary", size="lg", elem_id="run-btn")

gr.HTML('<div class="section-divider"></div>')
gr.HTML('<div class="section-label">Fine-Tune Alignment</div>')

# ── MacBook-style arrow key D-pad ─────────────────────────
# Row 1: [ β–² ] (centred, half-row; the empty flex divs centre the button)
with gr.Row(equal_height=True, elem_id="nudge-row-top"):
    gr.HTML('<div style="flex:1;min-width:0"></div>')
    nudge_up_btn = gr.Button("β–²", elem_id="nudge-up", min_width=44, scale=0)
    gr.HTML('<div style="flex:1;min-width:0"></div>')

# Row 2: [ β—€ ][ β–Ό ][ β–Ά ]
with gr.Row(equal_height=True, elem_id="nudge-row-bot"):
    nudge_left_btn = gr.Button("β—€", elem_id="nudge-left", min_width=44, scale=0)
    nudge_down_btn = gr.Button("β–Ό", elem_id="nudge-down", min_width=44, scale=0)
    nudge_right_btn = gr.Button("β–Ά", elem_id="nudge-right", min_width=44, scale=0)

gr.HTML('<p class="nudge-tip">Tip: Run Audit resets alignment</p>')

# Step size (pixels) applied per arrow-button click
nudge_step = gr.Number(
    value=1, minimum=1, maximum=100, step=1,
    label="Step Size (px)", precision=0,
    elem_id="nudge-step",
)
# Scale factor applied to the red (new-document) overlay layer
nudge_scale = gr.Number(
    value=1.0, minimum=0.10, maximum=10.0, step=0.005,
    label="Scale β€” Red Layer", precision=3,
    elem_id="nudge-scale",
)
# Live x/y/scale readout, refreshed by the nudge/scale handlers via _readout_html
nudge_readout = gr.HTML(
    value='<div id="nudge-readout-wrap">x&nbsp;=&nbsp;+0 px<br>y&nbsp;=&nbsp;+0 px<br>scale&nbsp;=&nbsp;1.000</div>',
    elem_id="nudge-readout",
)
1563
+ # ════════════════════════════════════════════════════════════
1564
+ # RIGHT PANE β€” results
1565
+ # ════════════════════════════════════════════════════════════
1566
# ════════════════════════════════════════════════════════════
# RIGHT PANE β€” results
# ════════════════════════════════════════════════════════════
with gr.Column(scale=3, elem_id="right-panel"):

    # ── Toolbar: view tabs | rotation buttons ──
    with gr.Row(elem_id="toolbar-row"):
        view_mode = gr.Radio(
            choices=["Auto-Overlay", "Previous Revision", "New Document"],
            value="Auto-Overlay",
            label="View",
            show_label=False,
            scale=1,
            min_width=320,
            elem_id="view-mode-radio",
        )
        gr.HTML('<div class="toolbar-sep"></div>')
        rot_left_btn = gr.Button("β†Ί", scale=0, elem_id="rot-left", min_width=38)
        rot_right_btn = gr.Button("↻", scale=0, elem_id="rot-right", min_width=38)

    # ── Page slider (shown only after audit runs) ──────────────
    page_slider = gr.Slider(
        minimum=1, maximum=1, value=1, step=1,
        label="Page",
        visible=False,  # revealed by on_run when the result has >1 page
        elem_id="page-slider",
    )

    # Hidden per-session state: current page and page count of the result
    page_num_state = gr.State(value=1)
    total_pages_state = gr.State(value=1)

    # Main result viewer (overlay / prev / new, per view_mode)
    result_image = gr.Image(
        label="",
        type="pil",
        height=720,
        interactive=False,
        elem_id="result-image",
    )

    # Static colour legend for the overlay view (runtime HTML, kept verbatim)
    gr.HTML("""
    <div id="legend-bar" style="display:flex; gap:18px; flex-wrap:wrap; align-items:center;">
      <span style="font-size:0.60rem;font-weight:700;color:#8BA0BB;text-transform:uppercase;
            letter-spacing:0.11em;white-space:nowrap;flex-shrink:0;">Overlay Legend</span>
      <span style="display:flex;align-items:center;gap:6px;">
        <span style="width:12px;height:12px;border-radius:3px;background:#7A7A7A;
              flex-shrink:0;display:inline-block;box-shadow:0 1px 2px rgba(0,0,0,0.15);"></span>
        <span style="font-size:0.75rem;color:#4A6585;white-space:nowrap;">
          <b style="color:#0F1C2E;font-weight:600;">Gray</b>&nbsp;&mdash;&nbsp;Unchanged</span>
      </span>
      <span style="display:flex;align-items:center;gap:6px;">
        <span style="width:12px;height:12px;border-radius:3px;background:#00BBBB;
              flex-shrink:0;display:inline-block;box-shadow:0 1px 2px rgba(0,0,0,0.15);"></span>
        <span style="font-size:0.75rem;color:#4A6585;white-space:nowrap;">
          <b style="color:#007070;font-weight:600;">Cyan</b>&nbsp;&mdash;&nbsp;Previous Revision</span>
      </span>
      <span style="display:flex;align-items:center;gap:6px;">
        <span style="width:12px;height:12px;border-radius:3px;background:#EE3333;
              flex-shrink:0;display:inline-block;box-shadow:0 1px 2px rgba(0,0,0,0.15);"></span>
        <span style="font-size:0.75rem;color:#4A6585;white-space:nowrap;">
          <b style="color:#BB0000;font-weight:600;">Red</b>&nbsp;&mdash;&nbsp;New Document</span>
      </span>
    </div>
    """)

    with gr.Row():
        pdf_output = gr.File(label="⬇️ Download Result PDF")
1630
+
1631
+ # ══════════════════════════════════════════════════════════════════
1632
+ # EVENT HANDLERS
1633
+ # ══════════════════════════════════════════════════════════════════
1634
+
1635
def on_pdf_upload(pdf_file):
    """Enable the skip-cover-page checkbox only for multi-page PDFs.

    Args:
        pdf_file: The gr.File upload value (object with a ``.name`` path),
            or None when the upload is cleared.

    Returns:
        gr.update for the checkbox: disabled + unchecked for no file or a
        single-page PDF, enabled otherwise.
    """
    if pdf_file is None:
        return gr.update(interactive=False, value=False)
    try:
        doc = fitz.open(pdf_file.name)
        try:
            n = len(doc)
        finally:
            doc.close()  # always release the handle, even if len() raises
    except Exception:
        # Unreadable/corrupt file: leave the checkbox usable; the real error
        # surfaces later when the audit actually opens the PDF.
        return gr.update(interactive=True)
    if n <= 1:
        return gr.update(interactive=False, value=False)
    return gr.update(interactive=True)
1649
+
1650
+ def _readout_html(nx: int, ny: int, ns: float) -> str:
1651
+ return (
1652
+ f'<div id="nudge-readout-wrap">'
1653
+ f'x&nbsp;=&nbsp;{nx:+d}&thinsp;px<br>'
1654
+ f'y&nbsp;=&nbsp;{ny:+d}&thinsp;px<br>'
1655
+ f'scale&nbsp;=&nbsp;{ns:.3f}'
1656
+ f'</div>'
1657
+ )
1658
+
1659
def on_compare_mode_change(mode):
    """Reveal the page-number row and region column only in 'Specific Pages' mode.

    Returns gr.update(visible=...) twice: for specific_pages_row and region_col.
    """
    visible = mode == "Specific Pages"
    return gr.update(visible=visible), gr.update(visible=visible)
1663
+
1664
def on_load_preview(pdf_new_f, pg_new):
    """Render one page of the New Document PDF as a low-DPI preview image.

    Args:
        pdf_new_f: The gr.File value for the new PDF (must not be None).
        pg_new: 1-based page number chosen by the user; clamped into range.

    Returns:
        Tuple of (PIL image, coords-textbox reset "", coords-state reset None,
        preview DPI, readout HTML) matching ``_preview_outputs``.

    Raises:
        gr.Error: If no PDF is uploaded or the PDF has no pages.
    """
    if pdf_new_f is None:
        raise gr.Error("Please upload the New Document PDF first.")
    preview_dpi = 72  # low DPI keeps the inline preview small and fast
    doc = fitz.open(pdf_new_f.name)
    try:
        if len(doc) == 0:
            raise gr.Error("The uploaded PDF contains no pages.")
        # Clamp the 1-based user page number into the valid 0-based index range.
        idx = max(0, int(pg_new or 1) - 1)
        idx = min(idx, len(doc) - 1)
        arr = _page_to_rgb(doc, idx, preview_dpi)
    finally:
        doc.close()  # release the document even if rendering fails
    pil_img = Image.fromarray(arr)
    readout = '<div id="region-readout">Draw a box on the image below to select a region</div>'
    # returns: pil_img, coords_txt_reset, coords_state_reset, display_dpi, readout
    return pil_img, "", None, preview_dpi, readout
1678
+
1679
def on_region_coords_change(coords_txt):
    """Parse the 'x,y,w,h' string pushed by the JS canvas into a region dict.

    Returns:
        (coords_dict_or_None, readout_html). The dict has keys
        x / y / width / height; None means no usable region.
    """
    if not coords_txt or not coords_txt.strip():
        return None, '<div id="region-readout">No region selected β€” full page will be used</div>'
    try:
        # Convert every comma-separated field; any non-numeric field is invalid.
        parts = [float(v) for v in coords_txt.strip().split(",")]
        x, y = int(parts[0]), int(parts[1])
        w, h = int(parts[2]), int(parts[3])
        if min(w, h) < 5:
            return None, '<div id="region-readout">Region too small β€” drag a larger area</div>'
        readout = (
            f'<div id="region-readout">'
            f'βœ… Region: ({x}, {y}) β†’ ({x+w}, {y+h})'
            f'&nbsp;|&nbsp;{w}&times;{h} px'
            f'</div>'
        )
        return {"x": x, "y": y, "width": w, "height": h}, readout
    except Exception:
        return None, '<div id="region-readout">Invalid region β€” drag again</div>'
1698
+
1699
def on_clear_region():
    """Reset the selected region: wipe the coords textbox and coords state.

    The preview image itself is left untouched; the JS overlay notices the
    emptied textbox on its next poll and erases the drawn rectangle.
    """
    prompt = '<div id="region-readout">Draw a box on the image below to select a region</div>'
    return "", None, prompt
1702
+
1703
def on_run(pdf_old_f, pdf_new_f, skip_old, skip_new, align,
           cmp_mode, pg_old, pg_new,
           pg_cmp_mode, region_coords, display_dpi,
           progress=gr.Progress()):
    """Run the comparison and reset all per-view state for the fresh result.

    Output order matches run_btn.click: pages, rotation, nudge x/y/scale,
    page number, total pages, result PDF path, first image, readout HTML,
    and the page-slider visibility/range update.
    """
    (page_results, _summary, pdf_path, _unused_a, _unused_b) = run_comparison(
        pdf_old_f, pdf_new_f, skip_old, skip_new, align,
        cmp_mode, pg_old, pg_new,
        pg_cmp_mode, region_coords, display_dpi,
        progress
    )
    n_pages = len(page_results)
    first_img = page_results[0]["align_check"] if page_results else None
    slider_update = gr.update(visible=n_pages > 1, minimum=1, maximum=n_pages, value=1)
    return (
        page_results,            # pages_state
        0,                       # rotation reset
        0,                       # nudge_x reset
        0,                       # nudge_y reset
        1.0,                     # nudge_scale reset
        1,                       # current page back to 1
        n_pages,                 # total_pages
        pdf_path,                # downloadable result PDF
        first_img,               # first page shown immediately
        _readout_html(0, 0, 1.0),
        slider_update,
    )
1728
+
1729
def on_view_change(view, pg, total, pages_data, rot, nx, ny, ns):
    """Redraw the current page for the newly selected view.

    Note: rotation is deliberately reset to 0 (the incoming ``rot`` is ignored).
    """
    img = get_page_view(pg, pages_data, view, 0, nx, ny, ns)
    return img, 0
1731
+
1732
def on_rot_left(pg, total, pages_data, view, rot, nx, ny, ns):
    """Left-rotate button: advance the rotation state by +90Β° (mod 360) and redraw."""
    updated = (rot + 90) % 360
    img = get_page_view(pg, pages_data, view, updated, nx, ny, ns)
    return img, updated
1735
+
1736
def on_rot_right(pg, total, pages_data, view, rot, nx, ny, ns):
    """Right-rotate button: step the rotation state by -90Β° (mod 360) and redraw."""
    updated = (rot - 90) % 360
    img = get_page_view(pg, pages_data, view, updated, nx, ny, ns)
    return img, updated
1739
+
1740
def on_pg_slide(pg, total, pages_data, view, rot, nx, ny, ns):
    """Jump to the slider's page and record it in page_num_state."""
    page = int(pg or 1)  # None/0 falls back to page 1
    return get_page_view(page, pages_data, view, rot, nx, ny, ns), page
1744
+
1745
+ # ── Nudge handlers (arrow buttons + scale change) ─────────────────
1746
def on_nudge(direction: str, pg, total, pages_data, view, rot, nx, ny, ns, step):
    """Shift the red (new-document) layer by one step in the given direction.

    ``direction`` is one of "left" / "right" / "up" / "down"; anything else
    leaves the offsets unchanged. Returns the redrawn image, the updated
    nudge state, and the refreshed readout HTML.
    """
    step = int(step or 1)
    dx, dy = {
        "left":  (-step, 0),
        "right": (step, 0),
        "up":    (0, -step),
        "down":  (0, step),
    }.get(direction, (0, 0))
    nx += dx
    ny += dy
    img = get_page_view(pg, pages_data, view, rot, nx, ny, ns)
    return img, nx, ny, ns, _readout_html(nx, ny, ns)
1754
+
1755
def on_scale_change(sc, pg, total, pages_data, view, rot, nx, ny):
    """Apply a new red-layer scale factor and redraw (falsy input β†’ 1.0)."""
    ns = 1.0 if not sc else float(sc)
    redrawn = get_page_view(pg, pages_data, view, rot, nx, ny, ns)
    return redrawn, ns, _readout_html(nx, ny, ns)
1759
+
1760
# Re-check page count (and the skip-cover checkbox) whenever either upload changes
pdf_old.change(fn=on_pdf_upload, inputs=[pdf_old], outputs=[skip_old_p1])
pdf_new.change(fn=on_pdf_upload, inputs=[pdf_new], outputs=[skip_new_p1])

# Show / hide specific-page inputs and region sub-options when compare mode changes
compare_mode.change(
    fn=on_compare_mode_change,
    inputs=[compare_mode],
    outputs=[specific_pages_row, region_col],
)

# Shared output list for every handler that (re)loads the region preview.
# _preview_outputs: [region_page_img, region_coords_txt, coords_state, display_dpi_state, region_readout]
_preview_outputs = [region_page_img, region_coords_txt,
                    region_coords_state, display_dpi_state, region_readout]
1774
+
1775
def on_page_compare_mode_change(sub_mode, pdf_new_f, pg_new):
    """Toggle the region-preview column; auto-load the preview when it is shown.

    Returns the column visibility update followed by the five
    ``_preview_outputs`` values (image, coords text, coords state, DPI, readout).
    """
    show = sub_mode == "Specific Region"
    col_update = gr.update(visible=show)
    if show:
        try:
            preview = on_load_preview(pdf_new_f, pg_new)
            return (col_update,) + tuple(preview)
        except Exception:
            pass  # e.g. no PDF uploaded yet β€” fall through to the blank state
    blank = '<div id="region-readout">No region selected β€” full page will be used</div>'
    return col_update, None, "", None, 72, blank
1786
+
1787
# Sub-mode radio: toggles the region preview column and loads the preview
page_compare_mode.change(
    fn=on_page_compare_mode_change,
    inputs=[page_compare_mode, pdf_new, page_new_input],
    outputs=[region_preview_col] + _preview_outputs,
)
1792
+
1793
+ # Re-load preview when the New Doc page number changes (if Specific Region is active)
1794
def on_page_new_change(pg_new, pdf_new_f, sub_mode):
    """Refresh the region preview when the New Document page number changes.

    Only acts while 'Specific Region' mode is active and a PDF is uploaded;
    otherwise (or on any load failure) the blank preview state is returned.
    """
    if sub_mode == "Specific Region" and pdf_new_f is not None:
        try:
            return on_load_preview(pdf_new_f, pg_new)
        except Exception:
            pass  # fall back to the blank preview below
    blank = '<div id="region-readout">No region selected β€” full page will be used</div>'
    return None, "", None, 72, blank
1802
+
1803
# Re-load the preview when the New Doc page number changes (if region mode is on)
page_new_input.change(
    fn=on_page_new_change,
    inputs=[page_new_input, pdf_new, page_compare_mode],
    outputs=_preview_outputs,
)

# JS canvas overlay writes "x,y,w,h" into region_coords_txt when drag ends β†’ parse to dict
region_coords_txt.change(
    fn=on_region_coords_change,
    inputs=[region_coords_txt],
    outputs=[region_coords_state, region_readout],
    show_progress="hidden",
    show_progress_on=[],
)

# Clear region button β€” clear coords, JS overlay self-clears on next poll
clear_region_btn.click(
    fn=on_clear_region,
    inputs=None,
    outputs=[region_coords_txt, region_coords_state, region_readout],
)

# Main action: run the comparison and reset all viewing state
run_btn.click(
    fn=on_run,
    inputs=[pdf_old, pdf_new, skip_old_p1, skip_new_p1, enable_align,
            compare_mode, page_old_input, page_new_input,
            page_compare_mode, region_coords_state, display_dpi_state],
    outputs=[pages_state, rotation_state, nudge_x_state, nudge_y_state, nudge_scale_state,
             page_num_state, total_pages_state,
             pdf_output, result_image, nudge_readout, page_slider],
)

# View-mode tab change (also resets rotation to 0 β€” see on_view_change)
view_mode.change(
    fn=on_view_change,
    inputs=[view_mode, page_num_state, total_pages_state, pages_state, rotation_state,
            nudge_x_state, nudge_y_state, nudge_scale_state],
    outputs=[result_image, rotation_state],
    show_progress="hidden",
    show_progress_on=[],
)

# Rotation buttons
rot_left_btn.click(
    fn=on_rot_left,
    inputs=[page_num_state, total_pages_state, pages_state, view_mode, rotation_state,
            nudge_x_state, nudge_y_state, nudge_scale_state],
    outputs=[result_image, rotation_state],
    show_progress="hidden",
    show_progress_on=[],
)
rot_right_btn.click(
    fn=on_rot_right,
    inputs=[page_num_state, total_pages_state, pages_state, view_mode, rotation_state,
            nudge_x_state, nudge_y_state, nudge_scale_state],
    outputs=[result_image, rotation_state],
    show_progress="hidden",
    show_progress_on=[],
)

# Page slider
page_slider.change(
    fn=on_pg_slide,
    inputs=[page_slider, total_pages_state, pages_state, view_mode,
            rotation_state, nudge_x_state, nudge_y_state, nudge_scale_state],
    outputs=[result_image, page_num_state],
    show_progress="hidden",
    show_progress_on=[],
)

# ── Nudge arrow buttons ───────────────────────────────────────────
# All four arrows share the same input/output lists; the lambda pins direction.
_nudge_inputs = [page_num_state, total_pages_state, pages_state, view_mode, rotation_state,
                 nudge_x_state, nudge_y_state, nudge_scale_state, nudge_step]
_nudge_outputs = [result_image, nudge_x_state, nudge_y_state,
                  nudge_scale_state, nudge_readout]

nudge_left_btn.click(
    fn=lambda *a: on_nudge("left", *a), inputs=_nudge_inputs, outputs=_nudge_outputs,
    show_progress="hidden", show_progress_on=[])
nudge_right_btn.click(
    fn=lambda *a: on_nudge("right", *a), inputs=_nudge_inputs, outputs=_nudge_outputs,
    show_progress="hidden", show_progress_on=[])
nudge_up_btn.click(
    fn=lambda *a: on_nudge("up", *a), inputs=_nudge_inputs, outputs=_nudge_outputs,
    show_progress="hidden", show_progress_on=[])
nudge_down_btn.click(
    fn=lambda *a: on_nudge("down", *a), inputs=_nudge_inputs, outputs=_nudge_outputs,
    show_progress="hidden", show_progress_on=[])

# ── Scale number input (live update on change) ────────────────────
nudge_scale.change(
    fn=on_scale_change,
    inputs=[nudge_scale, page_num_state, total_pages_state, pages_state, view_mode,
            rotation_state, nudge_x_state, nudge_y_state],
    outputs=[result_image, nudge_scale_state, nudge_readout],
    show_progress="hidden",
    show_progress_on=[],
)
1901
+
1902
+ # ── Inline canvas JS β€” overlays a transparent draw canvas on the gr.Image ──
1903
# Browser-side JS injected via demo.load(): overlays a transparent <canvas>
# on the region preview gr.Image, captures a mouse-drag bounding box, and
# writes "x,y,w,h" (in natural image pixels) into the hidden coords textbox
# so Python can read it via region_coords_txt.change. The string below is
# runtime code executed in the client and is kept verbatim.
_INLINE_CANVAS_JS = """
() => {
  let _overlay = null, _ctx = null;
  let _dragging = false, _sx = 0, _sy = 0, _sel = null;
  let _lastCoords = '';

  function getImgEl() {
    // The rendered <img> inside the gr.Image component
    const wrap = document.getElementById('region-page-img');
    return wrap ? wrap.querySelector('img') : null;
  }

  function getCoordsEl() {
    const wrap = document.getElementById('region-coords-txt');
    return wrap ? wrap.querySelector('textarea') : null;
  }

  function syncOverlay() {
    if (!_overlay) return;
    const img = getImgEl();
    if (!img || !img.src || img.src.startsWith('data:image/gif')) return;
    const r = img.getBoundingClientRect();
    const pr = img.parentElement.getBoundingClientRect();
    _overlay.style.left = (r.left - pr.left) + 'px';
    _overlay.style.top = (r.top - pr.top) + 'px';
    _overlay.style.width = r.width + 'px';
    _overlay.style.height = r.height + 'px';
    if (_overlay.width !== Math.round(r.width) || _overlay.height !== Math.round(r.height)) {
      _overlay.width = Math.round(r.width);
      _overlay.height = Math.round(r.height);
      redraw();
    }
  }

  function toCanvas(cx, cy) {
    const r = _overlay.getBoundingClientRect();
    return { x: (cx - r.left) * _overlay.width / r.width,
             y: (cy - r.top) * _overlay.height / r.height };
  }

  function redraw() {
    if (!_ctx || !_overlay.width) return;
    _ctx.clearRect(0, 0, _overlay.width, _overlay.height);
    if (_sel) {
      _ctx.strokeStyle = '#00BBBB';
      _ctx.lineWidth = Math.max(2, _overlay.width / 400);
      _ctx.strokeRect(_sel.x, _sel.y, _sel.w, _sel.h);
      _ctx.fillStyle = 'rgba(0,187,187,0.15)';
      _ctx.fillRect(_sel.x, _sel.y, _sel.w, _sel.h);
    }
  }

  function pushCoords() {
    const el = getCoordsEl();
    if (!el || !_sel) return;
    // Scale from display px back to natural image px
    const img = getImgEl();
    if (!img) return;
    const scaleX = img.naturalWidth / _overlay.width;
    const scaleY = img.naturalHeight / _overlay.height;
    const val = Math.round(_sel.x * scaleX) + ',' +
                Math.round(_sel.y * scaleY) + ',' +
                Math.round(_sel.w * scaleX) + ',' +
                Math.round(_sel.h * scaleY);
    const setter = Object.getOwnPropertyDescriptor(HTMLTextAreaElement.prototype, 'value').set;
    setter.call(el, val);
    el.dispatchEvent(new Event('input', { bubbles: true }));
  }

  function setupOverlay() {
    const imgWrap = document.getElementById('region-page-img');
    if (!imgWrap) return false;
    // Make sure parent is positioned
    const parent = imgWrap.querySelector('.image-container') || imgWrap;
    if (getComputedStyle(parent).position === 'static') parent.style.position = 'relative';

    if (!_overlay) {
      _overlay = document.createElement('canvas');
      _overlay.id = 'region-draw-overlay';
      _overlay.style.cssText = 'position:absolute;top:0;left:0;cursor:crosshair;z-index:10;pointer-events:all;';
      parent.appendChild(_overlay);
      _ctx = _overlay.getContext('2d');

      _overlay.addEventListener('mousedown', function(e) {
        const p = toCanvas(e.clientX, e.clientY);
        _sx = p.x; _sy = p.y; _sel = null; _dragging = true; e.preventDefault();
      });
      _overlay.addEventListener('mousemove', function(e) {
        if (!_dragging) return;
        const p = toCanvas(e.clientX, e.clientY);
        _sel = { x: Math.min(_sx, p.x), y: Math.min(_sy, p.y),
                 w: Math.abs(p.x - _sx), h: Math.abs(p.y - _sy) };
        redraw(); e.preventDefault();
      });
      _overlay.addEventListener('mouseup', function(e) {
        if (!_dragging) return; _dragging = false;
        if (!_sel || _sel.w < 5 || _sel.h < 5) { _sel = null; redraw(); return; }
        redraw(); pushCoords(); e.preventDefault();
      });
    }
    return true;
  }

  // Poll every 300ms: sync overlay size, watch for cleared coords
  setInterval(function() {
    setupOverlay();
    syncOverlay();

    // Clear overlay when coords textbox is wiped by Clear button
    const el = getCoordsEl();
    if (el) {
      const cur = el.value;
      if (cur !== _lastCoords) {
        _lastCoords = cur;
        if (cur === '') { _sel = null; redraw(); }
      }
    }
  }, 300);
}
"""
# Install the canvas script once the UI loads; fn=None means JS-only.
demo.load(fn=None, js=_INLINE_CANVAS_JS)
2024
+
2025
+
2026
+ # ══════════════════════════════════════════════════════════════════════
2027
+ # ENTRY POINT
2028
+ # ══════════════════════════════════════════════════════════════════════
2029
+
2030
# ══════════════════════════════════════════════════════════════════════
# ENTRY POINT
# ══════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    import socket as _socket

    def _find_free_port(start: int = 7860, end: int = 7880) -> int:
        """Return the first TCP port in [start, end] that can be bound.

        Falls back to ``start`` when nothing is free β€” Gradio will then
        error with a clear message.
        """
        for p in range(start, end + 1):
            with _socket.socket(_socket.AF_INET, _socket.SOCK_STREAM) as s:
                try:
                    s.bind(("", p))
                    return p  # socket closes on `with` exit, freeing the port
                except OSError:
                    continue
        return start  # fallback β€” Gradio will error with a clear message

    _port = _find_free_port()
    print(f"\nπŸš€ POWERGRID Document Auditor β†’ http://localhost:{_port}\n")
    # BUGFIX: `theme` and `css` are gr.Blocks() constructor arguments, not
    # Blocks.launch() arguments β€” passing them to launch() raises TypeError
    # on Gradio 4.x. They must be (and presumably already are) supplied where
    # `demo` is constructed with _THEME / _CSS.
    demo.queue(default_concurrency_limit=20).launch(
        server_name="0.0.0.0",
        server_port=_port,
        share=False,
        show_error=True,
    )