Yaz Hobooti committed on
Commit fa64916 · 1 Parent(s): 37c62cf

Update pdf_comparator.py: latest changes

Files changed (1): pdf_comparator.py (+1649, -262)
pdf_comparator.py CHANGED
@@ -12,8 +12,20 @@ from skimage import color
 import json
 import tempfile
 import shutil
 import unicodedata
-import regex as re

 # Domain whitelist for spell checking
 DOMAIN_WHITELIST = {
@@ -27,15 +39,35 @@ DOMAIN_WHITELIST = {
 # lowercase everything in whitelist for comparisons
 DOMAIN_WHITELIST = {w.lower() for w in DOMAIN_WHITELIST}

-# Safe import for regex with fallback
 try:
-    import regex as _re
-    _USE_REGEX = True
 except ImportError:
-    import re as _re
-    _USE_REGEX = False

-TOKEN_PATTERN = r"(?:\p{L})(?:[\p{L}'-]{1,})" if _USE_REGEX else r"[A-Za-z][A-Za-z'-]{1,}"

 class PDFComparator:
     def __init__(self):
@@ -43,7 +75,7 @@ class PDFComparator:
         self.english_spellchecker = SpellChecker(language='en')
         self.french_spellchecker = SpellChecker(language='fr')

-        # Add domain whitelist to spell checkers
         for w in DOMAIN_WHITELIST:
             self.english_spellchecker.word_frequency.add(w)
             self.french_spellchecker.word_frequency.add(w)
@@ -54,205 +86,1173 @@ class PDFComparator:
         except LookupError:
             nltk.download('punkt')

     def enhance_image_for_tiny_fonts(self, image):
         """Enhance image specifically for tiny font OCR"""
         try:
             gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
             clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
             enhanced = clahe.apply(gray)
             denoised = cv2.bilateralFilter(enhanced, 9, 75, 75)
             gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
             unsharp_mask = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
             thresh = cv2.adaptiveThreshold(unsharp_mask, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
             kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
             cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
             return cleaned
         except Exception as e:
             print(f"Error enhancing image for tiny fonts: {str(e)}")
             return image

     def create_inverted_image(self, image):
-        """Create inverted image for white text on dark backgrounds"""
         try:
             gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
             inverted = cv2.bitwise_not(gray)
-            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
             enhanced = clahe.apply(inverted)
             _, thresh = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
             return thresh
         except Exception as e:
             print(f"Error creating inverted image: {str(e)}")
             return image

     def extract_color_channels(self, image):
-        """Extract text from different color channels"""
         try:
-            # RGB channels
             b, g, r = cv2.split(image)

-            # HSV channels
-            hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
-            h, s, v = cv2.split(hsv)

-            # LAB channels
-            lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
-            l, a, b_lab = cv2.split(lab)

-            channels = [r, g, b, v, l]
-            texts = []

-            for channel in channels:
-                _, thresh = cv2.threshold(channel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-                text = pytesseract.image_to_string(thresh, config='--oem 3 --psm 6')
-                if text.strip():
-                    texts.append(text)

-            return texts
-        except Exception as e:
-            print(f"Error extracting color channels: {str(e)}")
-            return []
-
-    def create_edge_enhanced_image(self, image):
-        """Create edge-enhanced image for text detection"""
-        try:
-            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-            edges = cv2.Canny(gray, 50, 150)
-            kernel = np.ones((2,2), np.uint8)
-            dilated = cv2.dilate(edges, kernel, iterations=1)
-            inverted = cv2.bitwise_not(dilated)
-            return inverted
         except Exception as e:
-            print(f"Error creating edge-enhanced image: {str(e)}")
-            return image
-
-    def ocr_with_multiple_configs(self, image):
-        """Run OCR with multiple configurations and return best result"""
-        configs = [
-            '--oem 3 --psm 6',   # Uniform block of text
-            '--oem 3 --psm 8',   # Single word
-            '--oem 3 --psm 13',  # Raw line
-            '--oem 1 --psm 6',   # LSTM + Uniform block
-            '--oem 3 --psm 3',   # Fully automatic page segmentation
-        ]
-
-        best_text = ""
-        best_length = 0
-
-        for config in configs:
-            try:
-                text = pytesseract.image_to_string(image, config=config)
-                if len(text.strip()) > best_length:
-                    best_text = text
-                    best_length = len(text.strip())
-            except Exception as e:
-                print(f"OCR config {config} failed: {str(e)}")
-                continue

-        return best_text
-
-    def extract_multi_color_text(self, image):
-        """Extract text using multiple preprocessing methods"""
-        texts = []
-
-        # Method 1: Standard black text
-        enhanced = self.enhance_image_for_tiny_fonts(image)
-        text1 = self.ocr_with_multiple_configs(enhanced)
-        if text1.strip():
-            texts.append(text1)
-
-        # Method 2: Inverted text (white on dark)
-        inverted = self.create_inverted_image(image)
-        text2 = self.ocr_with_multiple_configs(inverted)
-        if text2.strip():
-            texts.append(text2)
-
-        # Method 3: Color channel separation
-        color_texts = self.extract_color_channels(image)
-        texts.extend(color_texts)
-
-        # Method 4: Edge-enhanced
-        edge_enhanced = self.create_edge_enhanced_image(image)
-        text4 = self.ocr_with_multiple_configs(edge_enhanced)
-        if text4.strip():
-            texts.append(text4)
-
-        # Combine all texts and return the best one
-        combined_text = " ".join(texts)
-        return combined_text

-    def validate_pdf(self, pdf_path):
-        """Validate that PDF contains '50 Carroll' using enhanced OCR"""
         try:
-            # Multiple DPI settings for better detection
-            dpi_settings = [200, 300, 400]

-            for dpi in dpi_settings:
                 try:
-                    images = convert_from_path(pdf_path, dpi=dpi)

-                    for page_num, image in enumerate(images):
-                        # Convert PIL image to OpenCV format
-                        opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

-                        # Enhanced text extraction
-                        text = self.extract_multi_color_text(opencv_image)

-                        # Check for "50 Carroll" with multiple patterns
-                        patterns = ["50 Carroll", "50 carroll", "50Carroll", "50 carroll"]
-                        for pattern in patterns:
-                            if pattern in text:
-                                return True

-                        # Also try standard OCR as fallback
-                        standard_text = pytesseract.image_to_string(opencv_image, config='--oem 3 --psm 6')
-                        for pattern in patterns:
-                            if pattern in standard_text:
-                                return True
-
                 except Exception as e:
-                    print(f"DPI {dpi} failed: {str(e)}")
                     continue

-            return False

         except Exception as e:
-            raise Exception(f"Error validating PDF: {str(e)}")

-    def extract_text_from_pdf(self, pdf_path):
-        """Extract text from PDF using enhanced OCR"""
         try:
-            # Use higher DPI for better text extraction
-            images = convert_from_path(pdf_path, dpi=300)
-            all_text = []

-            for page_num, image in enumerate(images):
-                opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

-                # Enhanced text extraction
-                text = self.extract_multi_color_text(opencv_image)

-                # Fallback to standard OCR if enhanced extraction is empty
-                if not text.strip():
-                    text = pytesseract.image_to_string(opencv_image, config='--oem 3 --psm 6')

-                all_text.append({
-                    'page': page_num + 1,
-                    'text': text,
-                    'image': image
-                })

-            return all_text

-        except Exception as e:
-            raise Exception(f"Error extracting text from PDF: {str(e)}")

-    def _likely_french(self, token: str) -> bool:
-        """Helper function to guess if a token is likely French"""
-        if _USE_REGEX:
-            # any Latin letter outside ASCII => probably FR (é, è, ç…)
-            return bool(_re.search(r"[\p{Letter}&&\p{Latin}&&[^A-Za-z]]", token))
-        # fallback: any non-ascii letter
-        return any((not ('a' <= c.lower() <= 'z')) and c.isalpha() for c in token)

     def check_spelling(self, text):
         """
@@ -263,9 +1263,11 @@
         - Flags if unknown in its likely language (not both)
         """
         try:
            text = unicodedata.normalize("NFKC", text)
            text = text.replace("\u2019", "'").replace("\u201c", '"').replace("\u201d", '"')

            tokens = _re.findall(TOKEN_PATTERN, text, flags=_re.UNICODE if _USE_REGEX else 0)

            issues = []
@@ -275,7 +1277,7 @@
                # skip very short, short ALL-CAPS acronyms, and whitelisted terms
                if len(t) < 3:
                    continue
-               if raw.isupper() and len(raw) <= 3:  # Changed from <=5 to <=3
                    continue
                if t in DOMAIN_WHITELIST:
                    continue
@@ -283,7 +1285,7 @@
                miss_en = t in self.english_spellchecker.unknown([t])
                miss_fr = t in self.french_spellchecker.unknown([t])

-               use_fr = self._likely_french(raw)

                # Prefer the likely language, but fall back to "either language unknown"
                if (use_fr and miss_fr) or ((not use_fr) and miss_en) or (miss_en and miss_fr):
@@ -299,76 +1301,18 @@
            print(f"Error checking spelling: {e}")
            return []

-    def annotate_spelling_errors_on_image(self, pil_image, misspelled):
-        """
-        Draw one red rectangle around each misspelled token using Tesseract word boxes.
-        'misspelled' must be a list of dicts with 'word' keys (from check_spelling).
-        """
-        if not misspelled:
-            return pil_image
-
-        def _norm(s: str) -> str:
-            return unicodedata.normalize("NFKC", s).replace("\u2019", "'").strip(".,:;!?)(").lower()
-
-        miss_set = {_norm(m["word"]) for m in misspelled}
-
-        img = pil_image
-        try:
-            data = pytesseract.image_to_data(
-                img,
-                lang="eng+fra",  # Added lang parameter
-                config="--oem 3 --psm 6",
-                output_type=pytesseract.Output.DICT,
-            )
-        except Exception as e:
-            print("image_to_data failed:", e)
-            return img
-
-        draw = ImageDraw.Draw(img)
-        n = len(data.get("text", []))
-        for i in range(n):
-            word = (data["text"][i] or "").strip()
-            if not word:
-                continue
-            clean = _norm(word)  # Used _norm function
-
-            if clean and clean in miss_set:
-                x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
-                draw.rectangle([x, y, x + w, y + h], outline="red", width=4)
-
-        return img
-
-    def detect_barcodes_qr_codes(self, image):
-        """Detect and decode barcodes and QR codes"""
-        try:
-            # Convert PIL image to OpenCV format
-            opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-
-            # Decode barcodes and QR codes
-            decoded_objects = decode(opencv_image)
-
-            barcodes = []
-            for obj in decoded_objects:
-                barcode_info = {
-                    'type': obj.type,
-                    'data': obj.data.decode('utf-8'),
-                    'rect': obj.rect
-                }
-                barcodes.append(barcode_info)
-
-            return barcodes
-
-        except Exception as e:
-            print(f"Error detecting barcodes: {str(e)}")
-            return []
-
     def compare_colors(self, image1, image2):
-        """Compare colors between two images and return differences"""
         try:
            # Convert images to same size
            img1 = np.array(image1)
            img2 = np.array(image2)

            # Resize images to same dimensions
            height = min(img1.shape[0], img2.shape[0])
            width = min(img1.shape[1], img2.shape[1])
@@ -376,31 +1320,284 @@
            img1_resized = cv2.resize(img1, (width, height))
            img2_resized = cv2.resize(img2, (width, height))

-           # Convert to grayscale for comparison
-           gray1 = cv2.cvtColor(img1_resized, cv2.COLOR_RGB2GRAY)
-           gray2 = cv2.cvtColor(img2_resized, cv2.COLOR_RGB2GRAY)

-           # Calculate structural similarity
-           (score, diff) = ssim(gray1, gray2, full=True)

-           # Convert difference to binary mask
-           diff = (diff * 255).astype("uint8")
-           thresh = cv2.threshold(diff, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

-           # Find contours of differences
-           contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

-           color_differences = []
-           for contour in contours:
-               if cv2.contourArea(contour) > 100:  # Filter small differences
-                   x, y, w, h = cv2.boundingRect(contour)
-                   color_differences.append({
-                       'x': x,
-                       'y': y,
-                       'width': w,
-                       'height': h,
-                       'area': cv2.contourArea(contour)
-                   })

  return color_differences
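For reference, a minimal, hedged sketch (not part of the commit; the arrays are made up) of the SSIM-based diffing removed above: skimage's structural_similarity with full=True returns the overall score plus a per-pixel similarity map that is near 1.0 where the pages agree, which is why the code thresholds it with THRESH_BINARY_INV to isolate disagreeing regions.

# Standalone illustration of ssim(..., full=True)
import numpy as np
from skimage.metrics import structural_similarity as ssim

a = np.zeros((64, 64), dtype=np.uint8)
b = a.copy()
b[16:32, 16:32] = 255                      # one changed block
score, diff = ssim(a, b, full=True)        # diff ~1.0 where images agree
print(round(float(score), 3), diff.shape)  # score < 1.0, (64, 64)
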
@@ -408,37 +1605,200 @@
            print(f"Error comparing colors: {str(e)}")
            return []

     def create_annotated_image(self, image, differences, output_path):
         """Create annotated image with red boxes around differences"""
         try:
            # Create a copy of the image
            annotated_image = image.copy()
            draw = ImageDraw.Draw(annotated_image)

            # Draw red rectangles around differences
-           for diff in differences:
                x, y, w, h = diff['x'], diff['y'], diff['width'], diff['height']
-               draw.rectangle([x, y, x + w, y + h], outline='red', width=3)

            # Save annotated image
            annotated_image.save(output_path)

         except Exception as e:
            print(f"Error creating annotated image: {str(e)}")

     def compare_pdfs(self, pdf1_path, pdf2_path, session_id):
-        """Main comparison function"""
         try:
            # Validate both PDFs contain "50 Carroll"
            if not self.validate_pdf(pdf1_path):
                raise Exception("INVALID DOCUMENT")

            if not self.validate_pdf(pdf2_path):
                raise Exception("INVALID DOCUMENT")

            # Extract text and images from both PDFs
            pdf1_data = self.extract_text_from_pdf(pdf1_path)
            pdf2_data = self.extract_text_from_pdf(pdf2_path)

            # Initialize results
            results = {
@@ -456,7 +1816,9 @@
            }

            # Compare text and check spelling
            for i, (page1, page2) in enumerate(zip(pdf1_data, pdf2_data)):
                page_results = {
                    'page': i + 1,
                    'text_differences': [],
@@ -468,34 +1830,66 @@
                }

                # Check spelling for both PDFs
                page_results['spelling_issues_pdf1'] = self.check_spelling(page1['text'])
                page_results['spelling_issues_pdf2'] = self.check_spelling(page2['text'])

                # Create spelling-only annotated images (one box per error)
                spell_dir = f'static/results/{session_id}'
                os.makedirs(spell_dir, exist_ok=True)
                spell_img1 = page1['image'].copy()
                spell_img2 = page2['image'].copy()
                spell_img1 = self.annotate_spelling_errors_on_image(spell_img1, page_results['spelling_issues_pdf1'])
                spell_img2 = self.annotate_spelling_errors_on_image(spell_img2, page_results['spelling_issues_pdf2'])
                spell_path1 = f'{spell_dir}/page_{i+1}_pdf1_spelling.png'
                spell_path2 = f'{spell_dir}/page_{i+1}_pdf2_spelling.png'
                spell_img1.save(spell_path1)
                spell_img2.save(spell_path2)

                # Detect barcodes and QR codes
-               page_results['barcodes_pdf1'] = self.detect_barcodes_qr_codes(page1['image'])
-               page_results['barcodes_pdf2'] = self.detect_barcodes_qr_codes(page2['image'])

                # Compare colors
                color_diffs = self.compare_colors(page1['image'], page2['image'])
                page_results['color_differences'] = color_diffs

-               # Create annotated images
                if color_diffs:
-                   output_dir = f'static/results/{session_id}'
-                   os.makedirs(output_dir, exist_ok=True)
-
                    annotated_path1 = f'{output_dir}/page_{i+1}_pdf1_annotated.png'
                    annotated_path2 = f'{output_dir}/page_{i+1}_pdf2_annotated.png'
@@ -504,32 +1898,19 @@
                    page_results['annotated_images'] = {
                        'pdf1': f'results/{session_id}/page_{i+1}_pdf1_annotated.png',
-                       'pdf2': f'results/{session_id}/page_{i+1}_pdf2_annotated.png',
-                       'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
-                       'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png'
                    }
                else:
-                   # If no color differences, still save spelling images
                    page_results['annotated_images'] = {
-                       'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
-                       'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png'
                    }

-               # Add spelling issues summary to text differences
-               if page_results['spelling_issues_pdf1'] or page_results['spelling_issues_pdf2']:
-                   page_results['text_differences'].append({
-                       'type': 'spelling',
-                       'pdf1_issues': len(page_results['spelling_issues_pdf1']),
-                       'pdf2_issues': len(page_results['spelling_issues_pdf2']),
-                       'details': {
-                           'pdf1': [issue['word'] for issue in page_results['spelling_issues_pdf1']],
-                           'pdf2': [issue['word'] for issue in page_results['spelling_issues_pdf2']]
-                       }
-                   })
-
                results['text_comparison'].append(page_results)

            # Aggregate spelling issues
            all_spelling_issues = []
            for page in results['text_comparison']:
                all_spelling_issues.extend(page['spelling_issues_pdf1'])
@@ -545,7 +1926,13 @@
            results['barcodes_qr_codes'] = all_barcodes

            return results

         except Exception as e:
-           raise Exception(f"Error comparing PDFs: {str(e)}")
 import json
 import tempfile
 import shutil
+import re
+import time
+import signal
 import unicodedata
+
+# Safe import for regex with fallback
+try:
+    import regex as _re
+    _USE_REGEX = True
+except ImportError:
+    import re as _re
+    _USE_REGEX = False
+
+TOKEN_PATTERN = r"(?:\p{L})(?:[\p{L}'-]{1,})" if _USE_REGEX else r"[A-Za-z][A-Za-z'-]{1,}"

 # Domain whitelist for spell checking
 DOMAIN_WHITELIST = {

 # lowercase everything in whitelist for comparisons
 DOMAIN_WHITELIST = {w.lower() for w in DOMAIN_WHITELIST}
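A quick, hedged illustration (editorial note, not part of the commit) of what TOKEN_PATTERN matches under each flavor: with the regex package installed, \p{L} covers accented letters, so French words tokenize whole; the stdlib re fallback is ASCII-only and splits them at accents.

sample = "The café's naïve owner can't re-open"
tokens = _re.findall(TOKEN_PATTERN, sample, flags=_re.UNICODE if _USE_REGEX else 0)
# with regex installed: ['The', "café's", 'naïve', 'owner', "can't", 're-open']
# stdlib re fallback:   ['The', 'caf', 'na', 've', 'owner', "can't", 're-open']
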
+def _likely_french(token: str) -> bool:
+    """Helper: quick language guess per token"""
+    if _USE_REGEX:
+        # any Latin letter outside ASCII => probably FR (é, è, ç…)
+        return bool(_re.search(r"[\p{Letter}&&\p{Latin}&&[^A-Za-z]]", token))
+    # fallback: any non-ascii letter
+    return any((not ('a' <= c.lower() <= 'z')) and c.isalpha() for c in token)
+
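Editorial note, not part of the commit: the regex module only honors set intersection syntax ([...&&...]) in its V1 mode, so under the default V0 behaviour the first branch above may not match as intended; the ASCII fallback is the dependable path. A minimal check of that fallback:

for token in ("cafe", "café", "Müller", "naive"):
    flagged = any((not ('a' <= c.lower() <= 'z')) and c.isalpha() for c in token)
    print(token, flagged)   # cafe False, café True, Müller True, naive False
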
+# Try to import additional barcode libraries
 try:
+    import zxing
+    ZXING_AVAILABLE = True
 except ImportError:
+    ZXING_AVAILABLE = False
+    print("zxing-cpp not available, using pyzbar only")

+try:
+    from dbr import BarcodeReader
+    DBR_AVAILABLE = True
+    print("Dynamsoft Barcode Reader available")
+except ImportError:
+    DBR_AVAILABLE = False
+    print("Dynamsoft Barcode Reader not available")
+
+class TimeoutError(Exception):
+    pass
+
+def timeout_handler(signum, frame):
+    raise TimeoutError("Operation timed out")

 class PDFComparator:
     def __init__(self):

         self.english_spellchecker = SpellChecker(language='en')
         self.french_spellchecker = SpellChecker(language='fr')

+        # Add domain whitelist words to spell checkers
         for w in DOMAIN_WHITELIST:
             self.english_spellchecker.word_frequency.add(w)
             self.french_spellchecker.word_frequency.add(w)

         except LookupError:
             nltk.download('punkt')

+    def safe_execute(self, func, *args, timeout=30, **kwargs):
+        """Execute a function with timeout protection"""
+        try:
+            # Set timeout signal
+            signal.signal(signal.SIGALRM, timeout_handler)
+            signal.alarm(timeout)
+
+            # Execute function
+            result = func(*args, **kwargs)
+
+            # Cancel timeout
+            signal.alarm(0)
+            return result
+
+        except TimeoutError:
+            print(f"Function {func.__name__} timed out after {timeout} seconds")
+            return None
+        except Exception as e:
+            print(f"Error in {func.__name__}: {str(e)}")
+            return None
+        finally:
+            signal.alarm(0)
+
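A hedged usage sketch (not part of the commit; "input.pdf" is a made-up path). Note that signal.SIGALRM timeouts work only on Unix and only from the main thread; on Windows the SIGALRM attribute does not exist and in worker threads signal.signal raises, so safe_execute would return None via its generic except branch.

comparator = PDFComparator()
pages = comparator.safe_execute(comparator.extract_text_from_pdf, "input.pdf", timeout=60)
if pages is None:
    print("Extraction timed out or failed")
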
+    def validate_pdf(self, pdf_path):
+        """Validate that PDF contains '50 Carroll' using enhanced OCR for tiny fonts"""
+        try:
+            print(f"Validating PDF: {pdf_path}")
+
+            # Try multiple DPI settings for better tiny font detection
+            dpi_settings = [300, 400, 600, 800]
+
+            for dpi in dpi_settings:
+                print(f"Trying DPI {dpi} for tiny font detection...")
+
+                # Convert PDF to images with current DPI
+                images = convert_from_path(pdf_path, dpi=dpi)
+                print(f"Converted PDF to {len(images)} images at {dpi} DPI")
+
+                for page_num, image in enumerate(images):
+                    print(f"Processing page {page_num + 1} at {dpi} DPI...")
+
+                    # Convert PIL image to OpenCV format
+                    opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+                    # Enhanced preprocessing for tiny fonts
+                    processed_image = self.enhance_image_for_tiny_fonts(opencv_image)
+
+                    # Try multiple OCR configurations
+                    ocr_configs = [
+                        '--oem 3 --psm 6',   # Assume uniform block of text
+                        '--oem 3 --psm 8',   # Single word
+                        '--oem 3 --psm 13',  # Raw line
+                        '--oem 1 --psm 6',   # Legacy engine
+                        '--oem 3 --psm 3',   # Fully automatic page segmentation
+                    ]
+
+                    for config in ocr_configs:
+                        try:
+                            # Perform OCR with current configuration
+                            text = pytesseract.image_to_string(processed_image, config=config)
+
+                            # Debug: Show first 300 characters of extracted text
+                            debug_text = text[:300].replace('\n', ' ').replace('\r', ' ')
+                            print(f"Page {page_num + 1} text (DPI {dpi}, config: {config}): '{debug_text}...'")
+
+                            # Check for "50 Carroll" with various patterns
+                            patterns = ["50 Carroll", "50 carroll", "50Carroll", "50carroll", "50 Carroll", "50 carroll"]
+                            for pattern in patterns:
+                                if pattern in text or pattern.lower() in text.lower():
+                                    print(f"Found '{pattern}' in page {page_num + 1} (DPI {dpi}, config: {config})")
+                                    return True
+
+                        except Exception as ocr_error:
+                            print(f"OCR error with config {config}: {str(ocr_error)}")
+                            continue
+
+            print("Validation failed: '50 Carroll' not found in any page with any DPI or OCR config")
+            return False
+
+        except Exception as e:
+            print(f"Error validating PDF: {str(e)}")
+            raise Exception(f"Error validating PDF: {str(e)}")
+
     def enhance_image_for_tiny_fonts(self, image):
         """Enhance image specifically for tiny font OCR"""
         try:
+            # Convert to grayscale
             gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+            # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
             clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
             enhanced = clahe.apply(gray)
+
+            # Apply bilateral filter to reduce noise while preserving edges
             denoised = cv2.bilateralFilter(enhanced, 9, 75, 75)
+
+            # Apply unsharp masking to enhance edges
             gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
             unsharp_mask = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
+
+            # Apply adaptive thresholding
             thresh = cv2.adaptiveThreshold(unsharp_mask, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
+
+            # Apply morphological operations to clean up
             kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
             cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
+
             return cleaned
+
         except Exception as e:
             print(f"Error enhancing image for tiny fonts: {str(e)}")
             return image

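A small numeric check (editorial, not in the commit) of the unsharp-mask step above: addWeighted(src, 1.5, blurred, -0.5, 0) computes src + 0.5 * (src - blurred), boosting pixels that stand out from their local average.

src, blurred = 100.0, 80.0
print(src * 1.5 + blurred * -0.5)   # 110.0 = 100 + 0.5 * (100 - 80)
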
+    def extract_text_from_pdf(self, pdf_path):
+        """Extract text from PDF with multi-color text detection."""
+        try:
+            # Try to extract embedded text first
+            embedded_text = ""
+            try:
+                import fitz  # PyMuPDF
+                doc = fitz.open(pdf_path)
+                all_text = []
+                any_text = False
+                for i, page in enumerate(doc):
+                    t = page.get_text()
+                    any_text |= bool(t.strip())
+                    all_text.append({"page": i+1, "text": t, "image": None})
+                doc.close()
+                if any_text:
+                    # render images for color diff/barcode only when needed
+                    images = convert_from_path(pdf_path, dpi=600)
+                    for d, im in zip(all_text, images):
+                        d["image"] = im
+                    return all_text
+            except Exception:
+                pass
+
+            # Enhanced OCR path with multi-color text detection
+            print("Extracting text with multi-color detection...")
+            images = convert_from_path(pdf_path, dpi=600)
+            all_text = []
+
+            for page_num, image in enumerate(images):
+                opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+                # Multi-color text extraction
+                combined_text = self.extract_multi_color_text(opencv_image)
+
+                all_text.append({
+                    'page': page_num + 1,
+                    'text': combined_text,
+                    'image': image
+                })
+
+            return all_text
+
+        except Exception as e:
+            raise Exception(f"Error extracting text from PDF: {str(e)}")
+
+    def extract_multi_color_text(self, image):
+        """Extract text from image in various colors using multiple preprocessing methods."""
+        try:
+            combined_text = ""
+
+            # Method 1: Standard black text detection
+            print("Method 1: Standard black text detection")
+            processed_image = self.enhance_image_for_tiny_fonts(image)
+            text1 = self.ocr_with_multiple_configs(processed_image)
+            combined_text += text1 + " "
+
+            # Method 2: Inverted text detection (for white text on dark background)
+            print("Method 2: Inverted text detection")
+            inverted_image = self.create_inverted_image(image)
+            text2 = self.ocr_with_multiple_configs(inverted_image)
+            combined_text += text2 + " "
+
+            # Method 3: Color channel separation for colored text
+            print("Method 3: Color channel separation")
+            for channel_name, channel_image in self.extract_color_channels(image):
+                text3 = self.ocr_with_multiple_configs(channel_image)
+                combined_text += text3 + " "
+
+            # Method 4: Edge-based text detection
+            print("Method 4: Edge-based text detection")
+            edge_image = self.create_edge_enhanced_image(image)
+            text4 = self.ocr_with_multiple_configs(edge_image)
+            combined_text += text4 + " "
+
+            return combined_text.strip()
+
+        except Exception as e:
+            print(f"Error in multi-color text extraction: {str(e)}")
+            return ""
+
     def create_inverted_image(self, image):
+        """Create inverted image for white text detection."""
         try:
+            # Convert to grayscale
             gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+            # Invert the image
             inverted = cv2.bitwise_not(gray)
+
+            # Apply CLAHE for better contrast
+            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
             enhanced = clahe.apply(inverted)
+
+            # Apply thresholding
             _, thresh = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
             return thresh
+
         except Exception as e:
             print(f"Error creating inverted image: {str(e)}")
             return image

     def extract_color_channels(self, image):
+        """Extract individual color channels for colored text detection."""
         try:
+            channels = []
+
+            # Convert to different color spaces
+            hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+            lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
+
+            # Extract individual channels
             b, g, r = cv2.split(image)
+            h, s, v = cv2.split(hsv)
+            l, a, b_lab = cv2.split(lab)
+
+            # Create channel images for OCR
+            channel_images = [
+                ("blue", b),
+                ("green", g),
+                ("red", r),
+                ("hue", h),
+                ("saturation", s),
+                ("value", v),
+                ("lightness", l)
+            ]
+
+            for name, channel in channel_images:
+                # Apply thresholding to each channel
+                _, thresh = cv2.threshold(channel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+                channels.append((name, thresh))
+
+            return channels
+
+        except Exception as e:
+            print(f"Error extracting color channels: {str(e)}")
+            return []
+
+    def create_edge_enhanced_image(self, image):
+        """Create edge-enhanced image for text detection."""
+        try:
+            # Convert to grayscale
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+            # Apply edge detection
+            edges = cv2.Canny(gray, 50, 150)
+
+            # Dilate edges to connect text components
+            kernel = np.ones((2, 2), np.uint8)
+            dilated = cv2.dilate(edges, kernel, iterations=1)
+
+            # Invert to get white text on black background
+            inverted = cv2.bitwise_not(dilated)
+
+            return inverted
+
+        except Exception as e:
+            print(f"Error creating edge-enhanced image: {str(e)}")
+            return image
+
+    def ocr_with_multiple_configs(self, image):
+        """Perform OCR with multiple configurations."""
+        try:
+            ocr_configs = [
+                '--oem 3 --psm 6',   # Assume uniform block of text
+                '--oem 3 --psm 8',   # Single word
+                '--oem 3 --psm 13',  # Raw line
+                '--oem 1 --psm 6',   # Legacy engine
+            ]
+
+            best_text = ""
+            for config in ocr_configs:
+                try:
+                    text = pytesseract.image_to_string(image, config=config)
+                    if len(text.strip()) > len(best_text.strip()):
+                        best_text = text
+                except Exception as ocr_error:
+                    print(f"OCR error with config {config}: {str(ocr_error)}")
+                    continue
+
+            return best_text
+
+        except Exception as e:
+            print(f"Error in OCR with multiple configs: {str(e)}")
+            return ""
+
+    def annotate_spelling_errors_on_image(self, pil_image, misspelled):
+        """
+        Draw one red rectangle around each misspelled token using Tesseract word boxes.
+        'misspelled' must be a list of dicts with 'word' keys (from check_spelling).
+        """
+        if not misspelled:
+            return pil_image
+
+        def _norm(s: str) -> str:
+            return unicodedata.normalize("NFKC", s).replace("\u2019", "'").strip(".,:;!?)(").lower()
+
+        # build a quick lookup of misspelled lowercase words
+        miss_set = {_norm(m["word"]) for m in misspelled}
+
+        # run word-level OCR to get boxes
+        img = pil_image
+        try:
+            data = pytesseract.image_to_data(
+                img,
+                lang="eng+fra",
+                config="--oem 3 --psm 6",
+                output_type=pytesseract.Output.DICT,
+            )
+        except Exception as e:
+            print("image_to_data failed:", e)
+            return img
+
+        draw = ImageDraw.Draw(img)
+        n = len(data.get("text", []))
+        for i in range(n):
+            word = (data["text"][i] or "").strip()
+            if not word:
+                continue
+            clean = _norm(word)
+
+            if clean and clean in miss_set:
+                x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
+                # draw a distinct box for this one word
+                draw.rectangle([x, y, x + w, y + h], outline="red", width=4)
+
+        return img
+
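For context (editorial note, not in the commit): pytesseract.image_to_data with Output.DICT returns parallel lists indexed per detected word, which is exactly what the loop above walks. A hypothetical shape for a page containing "Hello wrold":

# data["text"]   -> ['Hello', 'wrold']
# data["left"]   -> [12, 80]      data["top"]    -> [30, 30]
# data["width"]  -> [60, 58]      data["height"] -> [18, 18]
# data["conf"]   -> per-word confidences (representation varies by version)
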
+    def detect_barcodes_qr_codes(self, image):
+        """Detect and decode barcodes and QR codes with timeout protection"""
+        try:
+            print("Starting barcode detection...")
+            start_time = time.time()
+
+            # Convert PIL image to OpenCV format
+            opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+            all_barcodes = []
+
+            # Method 1: Basic pyzbar detection (fastest)
+            print("Method 1: Basic pyzbar detection")
+            pyzbar_results = self.detect_with_pyzbar_basic(opencv_image)
+            if pyzbar_results:
+                all_barcodes.extend(pyzbar_results)
+                print(f"Found {len(pyzbar_results)} barcodes with basic pyzbar")
+
+            # Method 2: Dynamsoft Barcode Reader (if available)
+            if DBR_AVAILABLE:
+                print("Method 2: Dynamsoft Barcode Reader")
+                dbr_results = self.detect_with_dynamsoft(opencv_image)
+                if dbr_results:
+                    all_barcodes.extend(dbr_results)
+                    print(f"Found {len(dbr_results)} barcodes with Dynamsoft")
+
+            # Method 3: Enhanced preprocessing (always run for better detection)
+            print("Method 3: Enhanced preprocessing")
+            enhanced_results = self.detect_with_enhanced_preprocessing(opencv_image)
+            if enhanced_results:
+                all_barcodes.extend(enhanced_results)
+                print(f"Found {len(enhanced_results)} additional barcodes with enhanced preprocessing")
+
+            # Method 4: Small barcode detection (always run for better detection)
+            print("Method 4: Small barcode detection")
+            small_results = self.detect_small_barcodes_simple(opencv_image)
+            if small_results:
+                all_barcodes.extend(small_results)
+                print(f"Found {len(small_results)} additional small barcodes")
+
+            # Remove duplicates
+            unique_barcodes = self.remove_duplicate_barcodes(all_barcodes)
+
+            # Enhance results
+            enhanced_barcodes = self.enhance_barcode_data(unique_barcodes)
+
+            elapsed_time = time.time() - start_time
+            print(f"Barcode detection completed in {elapsed_time:.2f} seconds. Found {len(enhanced_barcodes)} unique barcodes.")
+
+            return enhanced_barcodes
+
+        except Exception as e:
+            print(f"Error in barcode detection: {str(e)}")
+            return []
+
+    def detect_with_pyzbar_basic(self, image):
+        """Basic pyzbar detection without complex preprocessing"""
+        results = []
+
+        try:
+            # Simple grayscale conversion
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+            # Try original image
+            decoded_objects = decode(gray)
+            for obj in decoded_objects:
+                barcode_info = {
+                    'type': obj.type,
+                    'data': obj.data.decode('utf-8', errors='ignore'),
+                    'rect': obj.rect,
+                    'polygon': obj.polygon,
+                    'quality': getattr(obj, 'quality', 0),
+                    'orientation': self.detect_barcode_orientation(obj),
+                    'method': 'pyzbar_basic'
+                }
+
+                if 'databar' in obj.type.lower():
+                    barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
+
+                results.append(barcode_info)
+
+            # Try with simple contrast enhancement
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+            enhanced = clahe.apply(gray)
+            decoded_objects = decode(enhanced)
+
+            for obj in decoded_objects:
+                barcode_info = {
+                    'type': obj.type,
+                    'data': obj.data.decode('utf-8', errors='ignore'),
+                    'rect': obj.rect,
+                    'polygon': obj.polygon,
+                    'quality': getattr(obj, 'quality', 0),
+                    'orientation': self.detect_barcode_orientation(obj),
+                    'method': 'pyzbar_enhanced'
+                }
+
+                if 'databar' in obj.type.lower():
+                    barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
+
+                results.append(barcode_info)
+
+        except Exception as e:
+            print(f"Error in basic pyzbar detection: {str(e)}")
+
+        return results
+
+    def detect_with_dynamsoft(self, image):
+        """Detect barcodes using Dynamsoft Barcode Reader"""
+        results = []
+
+        try:
+            if not DBR_AVAILABLE:
+                return results
+
+            # Initialize Dynamsoft Barcode Reader
+            reader = BarcodeReader()
+
+            # Convert OpenCV image to bytes for Dynamsoft
+            success, buffer = cv2.imencode('.png', image)
+            if not success:
+                print("Failed to encode image for Dynamsoft")
+                return results
+
+            image_bytes = buffer.tobytes()
+
+            # Decode barcodes
+            text_results = reader.decode_file_stream(image_bytes)
+
+            for result in text_results:
+                barcode_info = {
+                    'type': result.barcode_format_string,
+                    'data': result.barcode_text,
+                    'rect': type('Rect', (), {
+                        'left': result.localization_result.x1,
+                        'top': result.localization_result.y1,
+                        'width': result.localization_result.x2 - result.localization_result.x1,
+                        'height': result.localization_result.y2 - result.localization_result.y1
+                    })(),
+                    'polygon': [
+                        (result.localization_result.x1, result.localization_result.y1),
+                        (result.localization_result.x2, result.localization_result.y1),
+                        (result.localization_result.x2, result.localization_result.y2),
+                        (result.localization_result.x1, result.localization_result.y2)
+                    ],
+                    'quality': result.confidence,
+                    'orientation': self.detect_barcode_orientation(result),
+                    'method': 'dynamsoft'
+                }
+
+                # Enhanced DataBar Expanded detection
+                if 'databar' in result.barcode_format_string.lower() or 'expanded' in result.barcode_format_string.lower():
+                    barcode_info['expanded_data'] = self.parse_databar_expanded(result.barcode_text)
+
+                results.append(barcode_info)
+
+            print(f"Dynamsoft detected {len(results)} barcodes")
+
+        except Exception as e:
+            print(f"Error in Dynamsoft detection: {str(e)}")
+
+        return results
+
+    def detect_with_enhanced_preprocessing(self, image):
+        """Enhanced preprocessing with limited methods"""
+        results = []
+
+        try:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+            # Limited preprocessing methods
+            processed_images = [
+                gray,  # Original
+                cv2.resize(gray, (gray.shape[1] * 3, gray.shape[0] * 3), interpolation=cv2.INTER_CUBIC),  # 3x scale
+                cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2),  # Adaptive threshold
+            ]
+
+            for i, processed_image in enumerate(processed_images):
+                try:
+                    decoded_objects = decode(processed_image)
+
+                    for obj in decoded_objects:
+                        barcode_info = {
+                            'type': obj.type,
+                            'data': obj.data.decode('utf-8', errors='ignore'),
+                            'rect': obj.rect,
+                            'polygon': obj.polygon,
+                            'quality': getattr(obj, 'quality', 0),
+                            'orientation': self.detect_barcode_orientation(obj),
+                            'method': f'enhanced_preprocessing_{i}'
+                        }
+
+                        if 'databar' in obj.type.lower():
+                            barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
+
+                        results.append(barcode_info)
+
+                except Exception as e:
+                    print(f"Error in enhanced preprocessing method {i}: {str(e)}")
+                    continue
+
+        except Exception as e:
+            print(f"Error in enhanced preprocessing: {str(e)}")
+
+        return results
+
+    def detect_small_barcodes_simple(self, image):
+        """Simplified small barcode detection"""
+        results = []
+
+        try:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+            # Only try 3x and 4x scaling
+            scale_factors = [3.0, 4.0]
+
+            for scale in scale_factors:
+                try:
+                    height, width = gray.shape
+                    new_height, new_width = int(height * scale), int(width * scale)
+                    scaled = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
+
+                    decoded_objects = decode(scaled)
+
+                    for obj in decoded_objects:
+                        # Scale back coordinates
+                        scale_factor = width / new_width
+                        scaled_rect = type('Rect', (), {
+                            'left': int(obj.rect.left * scale_factor),
+                            'top': int(obj.rect.top * scale_factor),
+                            'width': int(obj.rect.width * scale_factor),
+                            'height': int(obj.rect.height * scale_factor)
+                        })()
+
+                        barcode_info = {
+                            'type': obj.type,
+                            'data': obj.data.decode('utf-8', errors='ignore'),
+                            'rect': scaled_rect,
+                            'polygon': obj.polygon,
+                            'quality': getattr(obj, 'quality', 0),
+                            'orientation': self.detect_barcode_orientation(obj),
+                            'method': f'small_barcode_{scale}x',
+                            'size_category': 'small'
+                        }
+
+                        if 'databar' in obj.type.lower():
+                            barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
+
+                        results.append(barcode_info)
+
+                except Exception as e:
+                    print(f"Error in small barcode detection at {scale}x: {str(e)}")
+                    continue
+
+        except Exception as e:
+            print(f"Error in small barcode detection: {str(e)}")
+
+        return results
+
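A quick arithmetic check (editorial, not in the commit) of the coordinate scale-back above: detections found on a 3x-upscaled image are mapped to original pixels by multiplying by width / new_width.

width, scale = 200, 3.0
new_width = int(width * scale)      # 600
scale_factor = width / new_width    # ~0.333
print(int(450 * scale_factor))      # x=450 on the scaled image -> 150 on the original
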
+    def preprocess_image_for_ocr(self, image):
+        """Preprocess image for better OCR results"""
+        try:
+            # Convert to grayscale
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+            # Apply different preprocessing techniques
+
+            # 1. Resize image to improve small text recognition
+            height, width = gray.shape
+            scale_factor = 3.0  # Scale up for better small font recognition
+            new_height, new_width = int(height * scale_factor), int(width * scale_factor)
+            resized = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
+
+            # 2. Apply Gaussian blur to reduce noise
+            blurred = cv2.GaussianBlur(resized, (1, 1), 0)
+
+            # 3. Apply adaptive thresholding for better text separation
+            thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
+
+            # 4. Apply morphological operations to clean up text
+            kernel = np.ones((1, 1), np.uint8)
+            cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
+
+            # 5. Apply contrast enhancement
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+            enhanced = clahe.apply(cleaned)
+
+            return enhanced
+
+        except Exception as e:
+            print(f"Error preprocessing image: {str(e)}")
+            return image  # Return original if preprocessing fails
+
+    def preprocess_for_barcode_detection(self, image):
+        """Preprocess image with multiple techniques for better barcode detection"""
+        processed_images = [image]  # Start with original
+
+        try:
+            # Convert to grayscale
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+            processed_images.append(gray)
+
+            # Apply different preprocessing techniques
+
+            # 1. Contrast enhancement
+            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+            enhanced = clahe.apply(gray)
+            processed_images.append(enhanced)
+
+            # 2. Gaussian blur for noise reduction
+            blurred = cv2.GaussianBlur(gray, (3, 3), 0)
+            processed_images.append(blurred)
+
+            # 3. Adaptive thresholding
+            thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
+            processed_images.append(thresh)
+
+            # 4. Edge enhancement for better barcode detection
+            kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
+            sharpened = cv2.filter2D(gray, -1, kernel)
+            processed_images.append(sharpened)
+
+            # 5. Scale up for small barcodes
+            height, width = gray.shape
+            scale_factor = 3.0
+            new_height, new_width = int(height * scale_factor), int(width * scale_factor)
+            scaled = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
+            processed_images.append(scaled)
+
+        except Exception as e:
+            print(f"Error in barcode preprocessing: {str(e)}")
+
+        return processed_images
+
+    def preprocess_for_databar(self, gray_image):
+        """Specialized preprocessing for DataBar Expanded Stacked barcodes"""
+        processed_images = []
+
+        try:
+            # Original grayscale
+            processed_images.append(gray_image)
+
+            # 1. High contrast enhancement for DataBar
+            clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8, 8))
+            enhanced = clahe.apply(gray_image)
+            processed_images.append(enhanced)
+
+            # 2. Bilateral filter to preserve edges while reducing noise
+            bilateral = cv2.bilateralFilter(gray_image, 9, 75, 75)
+            processed_images.append(bilateral)
+
+            # 3. Adaptive thresholding with different parameters
+            thresh1 = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, 2)
+            processed_images.append(thresh1)
+
+            thresh2 = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
+            processed_images.append(thresh2)
+
+            # 4. Scale up for better DataBar detection
+            height, width = gray_image.shape
+            scale_factors = [2.0, 3.0, 4.0]
+
+            for scale in scale_factors:
+                new_height, new_width = int(height * scale), int(width * scale)
+                scaled = cv2.resize(gray_image, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
+                processed_images.append(scaled)
+
+            # 5. Edge enhancement specifically for DataBar
+            kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
+            sharpened = cv2.filter2D(gray_image, -1, kernel)
+            processed_images.append(sharpened)
+
+            # 6. Morphological operations for DataBar
+            kernel = np.ones((2, 2), np.uint8)
+            morphed = cv2.morphologyEx(gray_image, cv2.MORPH_CLOSE, kernel)
+            processed_images.append(morphed)
+
+        except Exception as e:
+            print(f"Error in DataBar preprocessing: {str(e)}")
+
+        return processed_images
+
+    def detect_with_transformations(self, image):
+        """Detect barcodes using multiple image transformations"""
+        results = []
+
+        try:
+            # Try different rotations
+            angles = [0, 90, 180, 270]
+
+            for angle in angles:
+                if angle == 0:
+                    rotated_image = image
+                else:
+                    height, width = image.shape[:2]
+                    center = (width // 2, height // 2)
+                    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
+                    rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))
+
+                # Try to detect barcodes in rotated image
+                try:
+                    decoded_objects = decode(rotated_image)
+
+                    for obj in decoded_objects:
+                        barcode_info = {
+                            'type': obj.type,
+                            'data': obj.data.decode('utf-8', errors='ignore'),
+                            'rect': obj.rect,
+                            'polygon': obj.polygon,
+                            'quality': getattr(obj, 'quality', 0),
+                            'orientation': f"{angle}°",
+                            'method': f'transform_{angle}deg'
+                        }
+
+                        # Enhanced DataBar Expanded detection
+                        if 'databar' in obj.type.lower() or 'expanded' in obj.type.lower():
+                            barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
+
+                        # Check for multi-stack barcodes
+                        if self.is_multi_stack_barcode(obj, rotated_image):
+                            barcode_info['stack_type'] = self.detect_stack_type(obj, rotated_image)
+
+                        results.append(barcode_info)
+
+                except Exception as e:
+                    print(f"Error in transformation detection at {angle}°: {str(e)}")
+                    continue
+
+        except Exception as e:
+            print(f"Error in transformation detection: {str(e)}")
+
+        return results
+
+    def detect_small_barcodes(self, image):
+        """Specialized detection for small barcodes and QR codes"""
+        results = []
+
+        try:
+            # Convert to grayscale
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+            # Apply specialized preprocessing for small barcodes
+            processed_images = self.preprocess_for_small_barcodes(gray)
+
+            for processed_image in processed_images:
+                try:
+                    decoded_objects = decode(processed_image)
+
+                    for obj in decoded_objects:
+                        # Check if this is a small barcode (less than 50x50 pixels)
+                        if obj.rect.width < 50 or obj.rect.height < 50:
+                            barcode_info = {
+                                'type': obj.type,
+                                'data': obj.data.decode('utf-8', errors='ignore'),
+                                'rect': obj.rect,
+                                'polygon': obj.polygon,
+                                'quality': getattr(obj, 'quality', 0),
+                                'orientation': self.detect_barcode_orientation(obj),
+                                'method': 'small_barcode_detection',
+                                'size_category': 'small'
+                            }
+
+                            # Enhanced DataBar Expanded detection
+                            if 'databar' in obj.type.lower() or 'expanded' in obj.type.lower():
+                                barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
+
+                            # Check for multi-stack barcodes
+                            if self.is_multi_stack_barcode(obj, image):
+                                barcode_info['stack_type'] = self.detect_stack_type(obj, image)
+
+                            results.append(barcode_info)
+
+                except Exception as e:
+                    print(f"Error in small barcode detection: {str(e)}")
+                    continue
+
+        except Exception as e:
+            print(f"Error in small barcode preprocessing: {str(e)}")
+
+        return results
+
+    def preprocess_for_small_barcodes(self, gray_image):
+        """Specialized preprocessing for small barcodes and QR codes"""
+        processed_images = []
+
+        try:
+            # Original grayscale
+            processed_images.append(gray_image)
+
+            # 1. Multiple high-resolution scaling for small barcodes
+            height, width = gray_image.shape
+            scale_factors = [4.0, 5.0, 6.0, 8.0]  # Higher scaling for small barcodes
+
+            for scale in scale_factors:
+                new_height, new_width = int(height * scale), int(width * scale)
+                scaled = cv2.resize(gray_image, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
+                processed_images.append(scaled)
+
+            # 2. Aggressive contrast enhancement
+            clahe = cv2.createCLAHE(clipLimit=5.0, tileGridSize=(8, 8))
+            enhanced = clahe.apply(gray_image)
+            processed_images.append(enhanced)
+
+            # 3. Unsharp masking for edge enhancement
+            gaussian = cv2.GaussianBlur(gray_image, (0, 0), 2.0)
+            unsharp = cv2.addWeighted(gray_image, 1.5, gaussian, -0.5, 0)
+            processed_images.append(unsharp)
+
+            # 4. Multiple thresholding methods
+            # Otsu's thresholding
+            _, otsu = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+            processed_images.append(otsu)
+
+            # Adaptive thresholding with different parameters
+            adaptive1 = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 2)
+            processed_images.append(adaptive1)
+
+            adaptive2 = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 7, 2)
+            processed_images.append(adaptive2)

+            # 5. Noise reduction with different methods
+            # Bilateral filter
+            bilateral = cv2.bilateralFilter(gray_image, 9, 75, 75)
+            processed_images.append(bilateral)

+            # Median filter
+            median = cv2.medianBlur(gray_image, 3)
+            processed_images.append(median)

+            # 6. Edge detection and enhancement
+            # Sobel edge detection
+            sobel_x = cv2.Sobel(gray_image, cv2.CV_64F, 1, 0, ksize=3)
+            sobel_y = cv2.Sobel(gray_image, cv2.CV_64F, 0, 1, ksize=3)
+            sobel = np.sqrt(sobel_x**2 + sobel_y**2)
+            sobel = np.uint8(sobel * 255 / sobel.max())
+            processed_images.append(sobel)

+            # 7. Morphological operations for small barcode cleanup
+            kernel = np.ones((2, 2), np.uint8)
+            morphed_close = cv2.morphologyEx(gray_image, cv2.MORPH_CLOSE, kernel)
+            processed_images.append(morphed_close)
+
+            kernel_open = np.ones((1, 1), np.uint8)
+            morphed_open = cv2.morphologyEx(gray_image, cv2.MORPH_OPEN, kernel_open)
+            processed_images.append(morphed_open)

         except Exception as e:
+            print(f"Error in small barcode preprocessing: {str(e)}")

+        return processed_images

+    def detect_with_high_resolution(self, image):
+        """Detect barcodes using high-resolution processing"""
+        results = []
+
         try:
+            # Convert to grayscale
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

+            # Process at multiple upscales; the scale factor is kept so the
+            # method name matches the keys used in calculate_confidence
+            height, width = gray.shape
+            scales = [3, 4, 6]
+
+            for scale in scales:
                 try:
+                    new_width, new_height = width * scale, height * scale
+                    # Resize with high-quality interpolation
+                    resized = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
+
+                    # Apply high-resolution preprocessing
+                    processed = self.preprocess_high_resolution(resized)
+
+                    # Try to detect barcodes
+                    decoded_objects = decode(processed)

+                    for obj in decoded_objects:
+                        # Scale the coordinates back to the original image size
+                        scale_factor = width / new_width
+                        scaled_rect = type('Rect', (), {
+                            'left': int(obj.rect.left * scale_factor),
+                            'top': int(obj.rect.top * scale_factor),
+                            'width': int(obj.rect.width * scale_factor),
+                            'height': int(obj.rect.height * scale_factor)
+                        })()

+                        barcode_info = {
+                            'type': obj.type,
+                            'data': obj.data.decode('utf-8', errors='ignore'),
+                            'rect': scaled_rect,
+                            'polygon': obj.polygon,
+                            'quality': getattr(obj, 'quality', 0),
+                            'orientation': self.detect_barcode_orientation(obj),
+                            'method': f'high_res_{scale}x',
+                            'resolution': f'{new_width}x{new_height}'
+                        }

+                        # Enhanced DataBar Expanded detection
+                        if 'databar' in obj.type.lower() or 'expanded' in obj.type.lower():
+                            barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
+
+                        # Check for multi-stack barcodes
+                        if self.is_multi_stack_barcode(obj, image):
+                            barcode_info['stack_type'] = self.detect_stack_type(obj, image)
+
+                        results.append(barcode_info)

                 except Exception as e:
+                    print(f"Error in high-resolution detection at {scale}x: {str(e)}")
                     continue
+
+        except Exception as e:
+            print(f"Error in high-resolution detection: {str(e)}")
+
+        return results
+
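+    # Editor's note (illustrative, not in the original commit): the dynamic
+    # `type('Rect', ...)` above works, but a namedtuple is the idiomatic way to
+    # build the same read-only record; a sketch under that assumption:
+    #
+    #     from collections import namedtuple
+    #     Rect = namedtuple('Rect', ['left', 'top', 'width', 'height'])
+    #     scaled_rect = Rect(int(obj.rect.left * scale_factor),
+    #                        int(obj.rect.top * scale_factor),
+    #                        int(obj.rect.width * scale_factor),
+    #                        int(obj.rect.height * scale_factor))
+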
+    def preprocess_high_resolution(self, image):
+        """Preprocessing optimized for high-resolution images"""
+        try:
+            # 1. High-quality noise reduction
+            denoised = cv2.fastNlMeansDenoising(image)

+            # 2. Advanced contrast enhancement
+            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+            enhanced = clahe.apply(denoised)
+
+            # 3. Edge-preserving smoothing
+            bilateral = cv2.bilateralFilter(enhanced, 9, 75, 75)
+
+            # 4. Sharpening
+            kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
+            sharpened = cv2.filter2D(bilateral, -1, kernel)
+
+            # 5. Adaptive thresholding for high-res
+            thresh = cv2.adaptiveThreshold(sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
+
+            return thresh

         except Exception as e:
+            print(f"Error in high-resolution preprocessing: {str(e)}")
+            return image

+    def detect_barcode_orientation(self, barcode_obj):
+        """Detect the orientation of the barcode"""
         try:
+            if hasattr(barcode_obj, 'polygon') and len(barcode_obj.polygon) >= 4:
+                # Calculate orientation based on polygon points
+                points = np.array(barcode_obj.polygon)
+                # Calculate the length and angle of each edge of the quadrilateral
+                edges = []
+                for i in range(4):
+                    p1 = points[i]
+                    p2 = points[(i + 1) % 4]
+                    edge_length = np.linalg.norm(p2 - p1)
+                    angle = np.arctan2(p2[1] - p1[1], p2[0] - p1[0]) * 180 / np.pi
+                    edges.append((edge_length, angle))
+
+                # The longest edge is likely the main barcode direction
+                longest_edge = max(edges, key=lambda x: x[0])
+                return f"{longest_edge[1]:.1f}°"

+            return "Unknown"
+        except Exception:
+            return "Unknown"
+
+    def parse_databar_expanded(self, data):
+        """Parse DataBar Expanded barcode data"""
+        try:
+            # DataBar Expanded can contain multiple data fields
+            # Format: [01]12345678901234[3101]123[3102]456
+            parsed_data = {}
+
+            # Extract GS1 Application Identifiers
+            ai_pattern = r'\[(\d{2,4})\]([^\[]+)'
+            matches = re.findall(ai_pattern, data)
+
+            for ai, value in matches:
+                parsed_data[f"AI {ai}"] = value
+
+            # If no AI pattern found, return original data
+            if not parsed_data:
+                parsed_data["Raw Data"] = data
+
+            return parsed_data
+
+        except Exception as e:
+            return {"Raw Data": data, "Parse Error": str(e)}
+
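+    # Worked example (editor's addition): with the format shown in the docstring,
+    # parse_databar_expanded("[01]12345678901234[3101]123[3102]456") returns
+    # {'AI 01': '12345678901234', 'AI 3101': '123', 'AI 3102': '456'}; a string
+    # with no [AI] groups comes back unchanged as {'Raw Data': <input>}.
+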
+    def is_multi_stack_barcode(self, barcode_obj, image):
+        """Detect if this is a multi-stack barcode"""
+        try:
+            if hasattr(barcode_obj, 'rect'):
+                x, y, w, h = barcode_obj.rect

+                # Check if the barcode is unusually tall (indicating stacked format)
+                aspect_ratio = h / w if w > 0 else 0

+                # DataBar Expanded and other stacked barcodes typically have aspect ratios > 0.3
+                return aspect_ratio > 0.3

+        except Exception:
+            pass
+
+        return False
+
+    def detect_stack_type(self, barcode_obj, image):
+        """Detect the type of multi-stack barcode"""
+        try:
+            if hasattr(barcode_obj, 'rect'):
+                x, y, w, h = barcode_obj.rect
+                aspect_ratio = h / w if w > 0 else 0
+
+                # Classify based on aspect ratio and barcode type
+                if 'databar' in barcode_obj.type.lower():
+                    if aspect_ratio > 0.5:
+                        return "Quad Stack"
+                    elif aspect_ratio > 0.35:
+                        return "Triple Stack"
+                    elif aspect_ratio > 0.25:
+                        return "Double Stack"
+                    else:
+                        return "Single Stack"
+                else:
+                    # For other barcode types
+                    if aspect_ratio > 0.4:
+                        return "Multi-Stack"
+                    else:
+                        return "Single Stack"
+
+        except Exception:
+            pass
+
+        return "Unknown"
+
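+    # Worked example (editor's addition): a DataBar region 100 px wide and
+    # 40 px tall gives aspect_ratio = 40/100 = 0.4, which falls in the
+    # (0.35, 0.5] band above and is classified as "Triple Stack".
+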
+    def remove_duplicate_barcodes(self, barcodes):
+        """Remove duplicate barcodes based on position and data"""
+        unique_barcodes = []
+        seen_positions = set()
+        seen_data = set()
+
+        for barcode in barcodes:
+            # Create position signature
+            pos_signature = f"{barcode['rect'].left},{barcode['rect'].top},{barcode['rect'].width},{barcode['rect'].height}"
+            data_signature = barcode['data']

+            # Keep only barcodes whose position and data are both new
+            if pos_signature not in seen_positions and data_signature not in seen_data:
+                unique_barcodes.append(barcode)
+                seen_positions.add(pos_signature)
+                seen_data.add(data_signature)
+
+        return unique_barcodes
+
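+    # Example (editor's addition): two hits with identical data "0123" at the
+    # same rect collapse to one entry; note that a hit at a *different*
+    # position carrying the same data is also dropped, because data
+    # signatures are tracked independently of position.
+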
+    def enhance_barcode_data(self, barcodes):
+        """Enhance barcode data with additional analysis"""
+        enhanced_barcodes = []
+
+        for barcode in barcodes:
+            # Add confidence score based on method and quality
+            confidence = self.calculate_confidence(barcode)
+            barcode['confidence'] = confidence

+            # Add GS1 validation for DataBar
+            if 'databar' in barcode['type'].lower():
+                barcode['gs1_validated'] = self.validate_gs1_format(barcode['data'])
+
+            enhanced_barcodes.append(barcode)
+
+        return enhanced_barcodes
+
+    def calculate_confidence(self, barcode):
+        """Calculate confidence score for barcode detection (capped at 100)"""
+        confidence = 50  # Base confidence
+
+        # Method confidence
+        method_scores = {
+            'pyzbar_basic': 70,
+            'pyzbar_enhanced': 70,
+            'dynamsoft': 85,  # Dynamsoft typically has higher accuracy
+            'enhanced_preprocessing_0': 65,
+            'enhanced_preprocessing_1': 60,
+            'enhanced_preprocessing_2': 55,
+            'transform_0deg': 60,
+            'transform_90deg': 50,
+            'transform_180deg': 50,
+            'transform_270deg': 50,
+            'small_barcode_detection': 75,
+            'high_res_2x': 70,
+            'high_res_3x': 65,
+            'high_res_4x': 60,
+            'high_res_6x': 55
+        }
+
+        if barcode.get('method') in method_scores:
+            confidence += method_scores[barcode['method']]
+
+        # Quality score
+        if barcode.get('quality', 0) > 0:
+            confidence += min(barcode['quality'], 20)
+
+        # DataBar specific confidence
+        if 'databar' in barcode['type'].lower():
+            confidence += 10
+
+        return min(confidence, 100)

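+    # Worked example (editor's addition): a DataBar hit found by the 3x
+    # high-resolution pass with quality 12 scores
+    # 50 (base) + 65 (high_res_3x) + 12 (quality) + 10 (DataBar) = 137,
+    # which the final min() caps at 100.
+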
+    def validate_gs1_format(self, data):
+        """Validate GS1 format for DataBar data"""
+        try:
+            # Check for GS1 Application Identifiers in bracket format
+            ai_pattern = r'\[(\d{2,4})\]'
+            if re.findall(ai_pattern, data):
+                return True
+
+            # Check for parentheses format
+            ai_pattern_parens = r'\((\d{2,4})\)'
+            matches_parens = re.findall(ai_pattern_parens, data)
+
+            return len(matches_parens) > 0
+
+        except Exception:
+            return False

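+    # Example (editor's addition): validate_gs1_format("(01)12345678901234")
+    # matches the parentheses pattern and returns True; a plain string such
+    # as "ABC123" matches neither pattern and returns False.
+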
     def check_spelling(self, text):
         """
         - Flags if unknown in its likely language (not both)
         """
         try:
+            # normalize ligatures & curly quotes
             text = unicodedata.normalize("NFKC", text)
             text = text.replace("\u2019", "'").replace("\u201c", '"').replace("\u201d", '"')

+            # unicode letters with internal ' or - allowed
             tokens = _re.findall(TOKEN_PATTERN, text, flags=_re.UNICODE if _USE_REGEX else 0)

             issues = []
             for raw in tokens:
                 t = raw.lower()
                 # skip very short, short ALL-CAPS acronyms, and whitelisted terms
                 if len(t) < 3:
                     continue
+                if raw.isupper() and len(raw) <= 3:
                     continue
                 if t in DOMAIN_WHITELIST:
                     continue

                 miss_en = t in self.english_spellchecker.unknown([t])
                 miss_fr = t in self.french_spellchecker.unknown([t])

+                use_fr = _likely_french(raw)

                 # Prefer the likely language, but fall back to "either language unknown"
                 if (use_fr and miss_fr) or ((not use_fr) and miss_en) or (miss_en and miss_fr):
                     issues.append({"word": raw})

             return issues

         except Exception as e:
             print(f"Error checking spelling: {e}")
             return []

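+    # Example (editor's addition): with the regex backend available,
+    # _re.findall(TOKEN_PATTERN, "l'équipe re-use USA") yields the tokens
+    # "l'équipe", "re-use" and "USA" (internal apostrophes and hyphens are
+    # kept); "USA" is then skipped by the short-ALL-CAPS rule above.
+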
     def compare_colors(self, image1, image2):
+        """Compare colors between two images and return differences using RGB color space"""
         try:
+            print("Starting RGB color comparison...")
+
             # Convert images to same size
             img1 = np.array(image1)
             img2 = np.array(image2)

+            print(f"Image 1 shape: {img1.shape}")
+            print(f"Image 2 shape: {img2.shape}")
+
             # Resize images to same dimensions
             height = min(img1.shape[0], img2.shape[0])
             width = min(img1.shape[1], img2.shape[1])

             img1_resized = cv2.resize(img1, (width, height))
             img2_resized = cv2.resize(img2, (width, height))

+            print(f"Resized to: {width}x{height}")
+
+            # Keep images in RGB format (no conversion to BGR)
+            img1_rgb = img1_resized
+            img2_rgb = img2_resized

+            color_differences = []

+            # Method 1: Weighted RGB channel comparison
+            print("Method 1: Enhanced RGB channel comparison")

+            # Calculate absolute difference for each RGB channel
+            diff_r = cv2.absdiff(img1_rgb[:,:,0], img2_rgb[:,:,0])  # Red channel
+            diff_g = cv2.absdiff(img1_rgb[:,:,1], img2_rgb[:,:,1])  # Green channel
+            diff_b = cv2.absdiff(img1_rgb[:,:,2], img2_rgb[:,:,2])  # Blue channel

+            # Weighted RGB combination (red and green weighted higher, blue lower)
+            diff_combined = cv2.addWeighted(diff_r, 0.4, diff_g, 0.4, 0)
+            diff_combined = cv2.addWeighted(diff_combined, 1.0, diff_b, 0.2, 0)
+
+            # Apply Gaussian blur to reduce noise
+            diff_combined = cv2.GaussianBlur(diff_combined, (3, 3), 0)
+
+            # Balanced thresholds: catch real color variations without flooding the page with boxes
+            rgb_thresholds = [15, 22, 30, 40]
+
+            for threshold in rgb_thresholds:
+                _, thresh = cv2.threshold(diff_combined, threshold, 255, cv2.THRESH_BINARY)
+
+                # Apply minimal morphological operations
+                kernel = np.ones((1, 1), np.uint8)  # Minimal kernel to preserve detail
+                thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
+                thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
+
+                # Find contours
+                contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+                print(f"RGB Threshold {threshold}: Found {len(contours)} contours")
+
+                for contour in contours:
+                    area = cv2.contourArea(contour)
+                    if area > 15:  # Balanced area threshold
+                        x, y, w, h = cv2.boundingRect(contour)
+
+                        # Get the mean RGB colors at this location
+                        color1 = img1_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
+                        color2 = img2_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
+
+                        # Calculate RGB color difference magnitude
+                        color_diff = np.linalg.norm(color1 - color2)
+
+                        # Flag moderate color differences
+                        if color_diff > 18:
+                            # Skip areas already covered by an earlier detection (21 px tolerance)
+                            already_covered = False
+                            for existing_diff in color_differences:
+                                if (abs(existing_diff['x'] - x) < 21 and
+                                    abs(existing_diff['y'] - y) < 21 and
+                                    abs(existing_diff['width'] - w) < 21 and
+                                    abs(existing_diff['height'] - h) < 21):
+                                    already_covered = True
+                                    break
+
+                            if not already_covered:
+                                color_differences.append({
+                                    'x': x,
+                                    'y': y,
+                                    'width': w,
+                                    'height': h,
+                                    'area': area,
+                                    'color1': color1.tolist(),
+                                    'color2': color2.tolist(),
+                                    'threshold': f"RGB_{threshold}",
+                                    'color_diff': color_diff,
+                                    'diff_r': float(abs(color1[0] - color2[0])),
+                                    'diff_g': float(abs(color1[1] - color2[1])),
+                                    'diff_b': float(abs(color1[2] - color2[2]))
+                                })
+
1402
+ # Method 2: Enhanced HSV color space comparison with 20% more accuracy
1403
+ print("Method 2: Enhanced HSV color space comparison")
1404
+
1405
+ # Convert to HSV for better color difference detection
1406
+ img1_hsv = cv2.cvtColor(img1_rgb, cv2.COLOR_RGB2HSV)
1407
+ img2_hsv = cv2.cvtColor(img2_rgb, cv2.COLOR_RGB2HSV)
1408
+
1409
+ # Enhanced HSV comparison with better channel weighting
1410
+ hue_diff = cv2.absdiff(img1_hsv[:,:,0], img2_hsv[:,:,0]) # Hue channel
1411
+ sat_diff = cv2.absdiff(img1_hsv[:,:,1], img2_hsv[:,:,1]) # Saturation channel
1412
+ val_diff = cv2.absdiff(img1_hsv[:,:,2], img2_hsv[:,:,2]) # Value channel
1413
+
1414
+ # Enhanced HSV combination with better weighting
1415
+ hsv_combined = cv2.addWeighted(hue_diff, 0.5, sat_diff, 0.3, 0) # Hue and Saturation
1416
+ hsv_combined = cv2.addWeighted(hsv_combined, 1.0, val_diff, 0.2, 0) # Add Value channel
1417
+
1418
+ # Apply Gaussian blur to reduce noise and improve accuracy
1419
+ hsv_combined = cv2.GaussianBlur(hsv_combined, (3, 3), 0)
1420
+
1421
+ # Apply balanced HSV thresholds to catch color variations while avoiding multiple boxes
1422
+ hsv_thresholds = [18, 25, 35, 45] # Balanced HSV thresholds
1423
+
1424
+ for threshold in hsv_thresholds:
1425
+ _, hsv_thresh = cv2.threshold(hsv_combined, threshold, 255, cv2.THRESH_BINARY)
1426
+
1427
+ # Apply minimal morphological operations
1428
+ kernel = np.ones((1, 1), np.uint8)
1429
+ hsv_thresh = cv2.morphologyEx(hsv_thresh, cv2.MORPH_CLOSE, kernel)
1430
+ hsv_thresh = cv2.morphologyEx(hsv_thresh, cv2.MORPH_OPEN, kernel)
1431
+
1432
+ # Find contours
1433
+ hsv_contours, _ = cv2.findContours(hsv_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
1434
+
1435
+ print(f"HSV Threshold {threshold}: Found {len(hsv_contours)} contours")
1436
+
1437
+ for contour in hsv_contours:
1438
+ area = cv2.contourArea(contour)
1439
+ if area > 15: # Balanced area threshold to catch variations while avoiding small boxes
1440
+ x, y, w, h = cv2.boundingRect(contour)
1441
+
1442
+ # Get the actual colors at this location
1443
+ color1 = img1_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
1444
+ color2 = img2_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
1445
+
1446
+ # Calculate color difference magnitude
1447
+ color_diff = np.linalg.norm(color1 - color2)
1448
+
1449
+ # Flag moderate color differences
1450
+ if color_diff > 22: # Balanced threshold
1451
+ # Check if this area is already covered (refined consolidated problem areas)
1452
+ already_covered = False
1453
+ for existing_diff in color_differences:
1454
+ if (abs(existing_diff['x'] - x) < 21 and
1455
+ abs(existing_diff['y'] - y) < 21 and
1456
+ abs(existing_diff['width'] - w) < 21 and
1457
+ abs(existing_diff['height'] - h) < 21):
1458
+ already_covered = True
1459
+ break
1460
+
1461
+ if not already_covered:
1462
+ color_differences.append({
1463
+ 'x': x,
1464
+ 'y': y,
1465
+ 'width': w,
1466
+ 'height': h,
1467
+ 'area': area,
1468
+ 'color1': color1.tolist(),
1469
+ 'color2': color2.tolist(),
1470
+ 'threshold': f"HSV_{threshold}",
1471
+ 'color_diff': color_diff,
1472
+ 'diff_r': float(abs(color1[0] - color2[0])),
1473
+ 'diff_g': float(abs(color1[1] - color2[1])),
1474
+ 'diff_b': float(abs(color1[2] - color2[2]))
1475
+ })
1476
+
1477
+ # Method 3: Enhanced pixel-by-pixel RGB comparison with 20% more accuracy
1478
+ print("Method 3: Enhanced pixel-by-pixel RGB comparison")
1479
+
1480
+ # Sample every 12th pixel for less sensitivity (20% less frequent)
1481
+ for y in range(0, height, 12):
1482
+ for x in range(0, width, 12):
1483
+ color1 = img1_rgb[y, x]
1484
+ color2 = img2_rgb[y, x]
1485
+
1486
+ # Calculate absolute difference for each RGB channel
1487
+ diff_r = abs(int(color1[0]) - int(color2[0])) # Red channel
1488
+ diff_g = abs(int(color1[1]) - int(color2[1])) # Green channel
1489
+ diff_b = abs(int(color1[2]) - int(color2[2])) # Blue channel
1490
+
1491
+ # Flag if RGB channels differ by moderate amounts
1492
+ if diff_r > 10 or diff_g > 10 or diff_b > 10:
1493
+ # Check if this area is already covered (refined consolidated problem areas)
1494
+ already_covered = False
1495
+ for existing_diff in color_differences:
1496
+ if (abs(existing_diff['x'] - x) < 21 and
1497
+ abs(existing_diff['y'] - y) < 21):
1498
+ already_covered = True
1499
+ break
1500
+
1501
+ if not already_covered:
1502
+ color_differences.append({
1503
+ 'x': x,
1504
+ 'y': y,
1505
+ 'width': 5, # Small box around the pixel
1506
+ 'height': 5,
1507
+ 'area': 25,
1508
+ 'color1': color1.tolist(),
1509
+ 'color2': color2.tolist(),
1510
+ 'threshold': 'pixel_RGB',
1511
+ 'color_diff': diff_r + diff_g + diff_b,
1512
+ 'diff_r': diff_r,
1513
+ 'diff_g': diff_g,
1514
+ 'diff_b': diff_b
1515
+ })
1516
+
1517
+ print(f"RGB color comparison completed. Found {len(color_differences)} total differences.")
1518
+
1519
+ # Method 4: LAB color space comparison for perceptual accuracy (20% more accurate)
1520
+ print("Method 4: LAB color space comparison")
1521
+
1522
+ # Convert to LAB color space for perceptual color differences
1523
+ img1_lab = cv2.cvtColor(img1_rgb, cv2.COLOR_RGB2LAB)
1524
+ img2_lab = cv2.cvtColor(img2_rgb, cv2.COLOR_RGB2LAB)
1525
+
1526
+ # Calculate LAB differences (perceptually uniform)
1527
+ lab_diff_l = cv2.absdiff(img1_lab[:,:,0], img2_lab[:,:,0]) # L channel (lightness)
1528
+ lab_diff_a = cv2.absdiff(img1_lab[:,:,1], img2_lab[:,:,1]) # a channel (green-red)
1529
+ lab_diff_b = cv2.absdiff(img1_lab[:,:,2], img2_lab[:,:,2]) # b channel (blue-yellow)
1530
+
1531
+ # Combine LAB differences with perceptual weighting
1532
+ lab_combined = cv2.addWeighted(lab_diff_l, 0.3, lab_diff_a, 0.35, 0) # L and a channels
1533
+ lab_combined = cv2.addWeighted(lab_combined, 1.0, lab_diff_b, 0.35, 0) # Add b channel
1534
+
1535
+ # Apply Gaussian blur for noise reduction
1536
+ lab_combined = cv2.GaussianBlur(lab_combined, (3, 3), 0)
1537
+
1538
+ # Apply balanced LAB thresholds to catch color variations while avoiding multiple boxes
1539
+ lab_thresholds = [20, 28, 38, 50] # Balanced LAB thresholds
1540
+
1541
+ for threshold in lab_thresholds:
1542
+ _, lab_thresh = cv2.threshold(lab_combined, threshold, 255, cv2.THRESH_BINARY)
1543
+
1544
+ # Apply morphological operations
1545
+ kernel = np.ones((1, 1), np.uint8)
1546
+ lab_thresh = cv2.morphologyEx(lab_thresh, cv2.MORPH_CLOSE, kernel)
1547
+ lab_thresh = cv2.morphologyEx(lab_thresh, cv2.MORPH_OPEN, kernel)
1548
+
1549
+ # Find contours
1550
+ lab_contours, _ = cv2.findContours(lab_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
1551
+
1552
+ print(f"LAB Threshold {threshold}: Found {len(lab_contours)} contours")
1553
+
1554
+ for contour in lab_contours:
1555
+ area = cv2.contourArea(contour)
1556
+ if area > 15: # Balanced area threshold to catch variations while avoiding small boxes
1557
+ x, y, w, h = cv2.boundingRect(contour)
1558
+
1559
+ # Get the actual colors at this location
1560
+ color1 = img1_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
1561
+ color2 = img2_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
1562
+
1563
+ # Calculate color difference magnitude
1564
+ color_diff = np.linalg.norm(color1 - color2)
1565
+
1566
+ # Flag moderate color differences
1567
+ if color_diff > 22: # Balanced threshold
1568
+ # Check if this area is already covered (refined consolidated problem areas)
1569
+ already_covered = False
1570
+ for existing_diff in color_differences:
1571
+ if (abs(existing_diff['x'] - x) < 21 and
1572
+ abs(existing_diff['y'] - y) < 21 and
1573
+ abs(existing_diff['width'] - w) < 21 and
1574
+ abs(existing_diff['height'] - h) < 21):
1575
+ already_covered = True
1576
+ break
1577
+
1578
+ if not already_covered:
1579
+ color_differences.append({
1580
+ 'x': x,
1581
+ 'y': y,
1582
+ 'width': w,
1583
+ 'height': h,
1584
+ 'area': area,
1585
+ 'color1': color1.tolist(),
1586
+ 'color2': color2.tolist(),
1587
+ 'threshold': f"LAB_{threshold}",
1588
+ 'color_diff': color_diff,
1589
+ 'diff_r': float(abs(color1[0] - color2[0])),
1590
+ 'diff_g': float(abs(color1[1] - color2[1])),
1591
+ 'diff_b': float(abs(color1[2] - color2[2]))
1592
+ })
1593
+
1594
+ print(f"Enhanced color comparison completed. Found {len(color_differences)} total differences.")
1595
+
1596
+ # Group nearby differences into one perimeter box per issue area
1597
+ if color_differences:
1598
+ grouped_differences = self.group_nearby_differences(color_differences)
1599
+ print(f"Grouped into {len(grouped_differences)} perimeter boxes")
1600
+ return grouped_differences
1601
 
1602
  return color_differences
1603
 
 
1605
  print(f"Error comparing colors: {str(e)}")
1606
  return []
1607
 
1608
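+    # Editor's sketch (not part of the original commit): since skimage.color is
+    # already imported at module top, a perceptual alternative to the channel
+    # arithmetic above is CIEDE2000 delta-E; pixels with dE above roughly 5 are
+    # usually visibly different. `_delta_e_map` is a hypothetical helper name.
+    def _delta_e_map(self, img1_rgb, img2_rgb):
+        """Per-pixel CIEDE2000 difference between two same-size RGB arrays."""
+        lab1 = color.rgb2lab(img1_rgb / 255.0)
+        lab2 = color.rgb2lab(img2_rgb / 255.0)
+        return color.deltaE_ciede2000(lab1, lab2)
+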
+    def group_nearby_differences(self, differences):
+        """Group nearby differences into one perimeter box per issue area"""
+        if not differences:
+            return []
+
+        # Sort differences by position for easier grouping
+        sorted_diffs = sorted(differences, key=lambda x: (x['y'], x['x']))
+
+        grouped_areas = []
+        current_group = []
+
+        for diff in sorted_diffs:
+            if not current_group:
+                current_group = [diff]
+            else:
+                # Check if this difference is close to the current group
+                should_group = False
+                for group_diff in current_group:
+                    # Calculate distance between centers
+                    center1_x = group_diff['x'] + group_diff['width'] // 2
+                    center1_y = group_diff['y'] + group_diff['height'] // 2
+                    center2_x = diff['x'] + diff['width'] // 2
+                    center2_y = diff['y'] + diff['height'] // 2
+
+                    distance = ((center1_x - center2_x) ** 2 + (center1_y - center2_y) ** 2) ** 0.5
+
+                    # Centers closer than 234 px belong to the same problem area
+                    if distance < 234:
+                        should_group = True
+                        break
+
+                if should_group:
+                    current_group.append(diff)
+                else:
+                    # Create perimeter box for current group
+                    if current_group:
+                        perimeter_box = self.create_perimeter_box(current_group)
+                        if perimeter_box:  # Only add if not None
+                            grouped_areas.append(perimeter_box)
+                    current_group = [diff]
+
+        # Don't forget the last group
+        if current_group:
+            perimeter_box = self.create_perimeter_box(current_group)
+            if perimeter_box:  # Only add if not None
+                grouped_areas.append(perimeter_box)
+
+        return grouped_areas
+
+    def create_perimeter_box(self, group):
+        """Create a perimeter box that encompasses all differences in a group"""
+        if not group:
+            return None
+
+        # Find the overall bounding box, extending each difference by 5 px
+        min_x = min(diff['x'] - 5 for diff in group)
+        min_y = min(diff['y'] - 5 for diff in group)
+        max_x = max(diff['x'] + diff['width'] + 5 for diff in group)
+        max_y = max(diff['y'] + diff['height'] + 5 for diff in group)
+
+        # Add minimal padding around the perimeter box
+        padding = 7
+        min_x = max(0, min_x - padding)
+        min_y = max(0, min_y - padding)
+        max_x = max_x + padding
+        max_y = max_y + padding
+
+        # Calculate final dimensions
+        width = max_x - min_x
+        height = max_y - min_y
+
+        # Filter out very small groups (likely noise)
+        if width < 26 or height < 26:
+            return None
+
+        return {
+            'x': min_x,
+            'y': min_y,
+            'width': width,
+            'height': height,
+            'area': width * height,
+            'color1': [0, 0, 0],  # Placeholder
+            'color2': [0, 0, 0],  # Placeholder
+            'threshold': 'perimeter',
+            'color_diff': 1.0,
+            'num_original_differences': len(group)
+        }
+
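+    # Worked example (editor's addition): a single 30x30 difference at
+    # (100, 100) is first extended by 5 px to (95, 95)-(135, 135), then padded
+    # by 7 px to (88, 88)-(142, 142), giving a 54x54 perimeter box; anything
+    # narrower or shorter than 26 px is discarded as noise.
+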
     def create_annotated_image(self, image, differences, output_path):
         """Create annotated image with red boxes around differences"""
         try:
+            print(f"Creating annotated image: {output_path}")
+            print(f"Number of differences to annotate: {len(differences)}")
+
             # Create a copy of the image
             annotated_image = image.copy()
             draw = ImageDraw.Draw(annotated_image)

             # Draw red rectangles around differences
+            for i, diff in enumerate(differences):
                 x, y, w, h = diff['x'], diff['y'], diff['width'], diff['height']
+
+                # Draw a thick red rectangle
+                draw.rectangle([x, y, x + w, y + h], outline='red', width=5)
+
+                print(f"Drawing rectangle {i+1}: ({x}, {y}) to ({x+w}, {y+h})")

             # Save annotated image
             annotated_image.save(output_path)
+            print(f"Annotated image saved successfully: {output_path}")

         except Exception as e:
             print(f"Error creating annotated image: {str(e)}")
+            # Try to save the original image as fallback
+            try:
+                image.save(output_path)
+                print(f"Saved original image as fallback: {output_path}")
+            except Exception as e2:
+                print(f"Failed to save fallback image: {str(e2)}")

     def compare_pdfs(self, pdf1_path, pdf2_path, session_id):
+        """Main comparison function with improved error handling"""
         try:
+            print("Starting PDF comparison...")
+            start_time = time.time()
+
             # Validate both PDFs contain "50 Carroll"
+            print("Validating PDF 1...")
             if not self.validate_pdf(pdf1_path):
                 raise Exception("INVALID DOCUMENT")

+            print("Validating PDF 2...")
             if not self.validate_pdf(pdf2_path):
                 raise Exception("INVALID DOCUMENT")

             # Extract text and images from both PDFs
+            print("Extracting text from PDF 1...")
             pdf1_data = self.extract_text_from_pdf(pdf1_path)
+            if not pdf1_data:
+                raise Exception("INVALID DOCUMENT")
+
+            print("Extracting text from PDF 2...")
             pdf2_data = self.extract_text_from_pdf(pdf2_path)
+            if not pdf2_data:
+                raise Exception("INVALID DOCUMENT")

             # Initialize results
             results = {
                 'text_comparison': [],
             }

             # Compare text and check spelling
+            print("Processing pages...")
             for i, (page1, page2) in enumerate(zip(pdf1_data, pdf2_data)):
+                print(f"Processing page {i + 1}...")
                 page_results = {
                     'page': i + 1,
                     'text_differences': [],
                     'spelling_issues_pdf1': [],
                     'spelling_issues_pdf2': [],
                     'barcodes_pdf1': [],
                     'barcodes_pdf2': [],
                     'color_differences': []
                 }

                 # Check spelling for both PDFs
+                print(f"Checking spelling for page {i + 1}...")
                 page_results['spelling_issues_pdf1'] = self.check_spelling(page1['text'])
                 page_results['spelling_issues_pdf2'] = self.check_spelling(page2['text'])

+                # Add spelling issues to text differences for UI visibility
+                if page_results['spelling_issues_pdf1'] or page_results['spelling_issues_pdf2']:
+                    page_results['text_differences'].append({
+                        "type": "spelling",
+                        "pdf1": [issue["word"] for issue in page_results['spelling_issues_pdf1']],
+                        "pdf2": [issue["word"] for issue in page_results['spelling_issues_pdf2']],
+                    })
+
                 # Create spelling-only annotated images (one box per error)
                 spell_dir = f'static/results/{session_id}'
                 os.makedirs(spell_dir, exist_ok=True)
+
                 spell_img1 = page1['image'].copy()
                 spell_img2 = page2['image'].copy()
                 spell_img1 = self.annotate_spelling_errors_on_image(spell_img1, page_results['spelling_issues_pdf1'])
                 spell_img2 = self.annotate_spelling_errors_on_image(spell_img2, page_results['spelling_issues_pdf2'])
+
                 spell_path1 = f'{spell_dir}/page_{i+1}_pdf1_spelling.png'
                 spell_path2 = f'{spell_dir}/page_{i+1}_pdf2_spelling.png'
                 spell_img1.save(spell_path1)
                 spell_img2.save(spell_path2)
+
+                # link them into the results for the UI
+                page_results.setdefault('annotated_images', {})
+                page_results['annotated_images'].update({
+                    'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
+                    'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png',
+                })

                 # Detect barcodes and QR codes
+                print(f"Detecting barcodes for page {i + 1} PDF 1...")
+                page_results['barcodes_pdf1'] = self.detect_barcodes_qr_codes(page1['image']) or []
+
+                print(f"Detecting barcodes for page {i + 1} PDF 2...")
+                page_results['barcodes_pdf2'] = self.detect_barcodes_qr_codes(page2['image']) or []

                 # Compare colors
+                print(f"Comparing colors for page {i + 1}...")
                 color_diffs = self.compare_colors(page1['image'], page2['image'])
                 page_results['color_differences'] = color_diffs

+                # Create annotated images and save original images
+                print(f"Creating images for page {i + 1}...")
+                output_dir = f'static/results/{session_id}'
+                os.makedirs(output_dir, exist_ok=True)
+
+                # Save original images
+                original_path1 = f'{output_dir}/page_{i+1}_pdf1_original.png'
+                original_path2 = f'{output_dir}/page_{i+1}_pdf2_original.png'
+
+                page1['image'].save(original_path1)
+                page2['image'].save(original_path2)
+
+                # Create annotated images if there are color differences
                 if color_diffs:
+                    print(f"Creating annotated images for page {i + 1}...")
                     annotated_path1 = f'{output_dir}/page_{i+1}_pdf1_annotated.png'
                     annotated_path2 = f'{output_dir}/page_{i+1}_pdf2_annotated.png'

                     self.create_annotated_image(page1['image'], color_diffs, annotated_path1)
                     self.create_annotated_image(page2['image'], color_diffs, annotated_path2)

+                    # update() rather than assignment, so the spelling links above survive
+                    page_results['annotated_images'].update({
                         'pdf1': f'results/{session_id}/page_{i+1}_pdf1_annotated.png',
+                        'pdf2': f'results/{session_id}/page_{i+1}_pdf2_annotated.png'
                     })
                 else:
+                    # If no color differences, link the original images instead
+                    page_results['annotated_images'].update({
+                        'pdf1': f'results/{session_id}/page_{i+1}_pdf1_original.png',
+                        'pdf2': f'results/{session_id}/page_{i+1}_pdf2_original.png'
                     })
                 results['text_comparison'].append(page_results)

             # Aggregate spelling issues
+            print("Aggregating results...")
             all_spelling_issues = []
             for page in results['text_comparison']:
                 all_spelling_issues.extend(page['spelling_issues_pdf1'])
                 all_spelling_issues.extend(page['spelling_issues_pdf2'])

             # Aggregate barcodes and QR codes from all pages
             all_barcodes = []
             for page in results['text_comparison']:
                 all_barcodes.extend(page['barcodes_pdf1'])
                 all_barcodes.extend(page['barcodes_pdf2'])
             results['barcodes_qr_codes'] = all_barcodes

+            elapsed_time = time.time() - start_time
+            print(f"PDF comparison completed in {elapsed_time:.2f} seconds.")
+
             return results

         except Exception as e:
+            print(f"Error in PDF comparison: {str(e)}")
+            raise Exception("INVALID DOCUMENT")
+
+# Enhanced OCR for tiny fonts - deployment check
+# Force rebuild - Thu Sep 4 09:33:44 EDT 2025