Yaz Hobooti committed on
Commit
36496e9
·
1 Parent(s): 0533111

Add validation to check for '50 carroll' text in PDFs before analysis

Browse files
Files changed (1) hide show
  1. app.py +35 -13
app.py CHANGED
@@ -41,8 +41,8 @@ except Exception:
41
  from pyspellchecker import SpellChecker
42
  HAS_SPELLCHECK = True
43
  except Exception:
44
- SpellChecker = None
45
- HAS_SPELLCHECK = False
46
 
47
  try:
48
  import regex as re
@@ -639,7 +639,7 @@ import unicodedata
639
  import regex as re
640
  import pytesseract
641
  try:
642
- from spellchecker import SpellChecker
643
  except ImportError:
644
  try:
645
  from pyspellchecker import SpellChecker
@@ -1098,19 +1098,19 @@ def _decode_zxing_all(pil: Image.Image) -> List[Dict[str, Any]]:
1098
  zx = zxingcpp.read_barcodes(arr)
1099
  for r in zx or []:
1100
  x1=y1=w=h=0
1101
- pos = getattr(r, "position", None)
1102
  pts=[]
1103
- if pos is not None:
1104
- try:
1105
  pts=list(pos)
1106
- except TypeError:
1107
  for name in ("top_left","topLeft","top_right","topRight","bottom_left","bottomLeft","bottom_right","bottomRight",
1108
  "point1","point2","point3","point4"):
1109
- if hasattr(pos, name):
1110
  p=getattr(pos,name)
1111
  if hasattr(p,"x") and hasattr(p,"y"):
1112
- pts.append(p)
1113
- if pts:
1114
  xs=[int(getattr(p,"x",0)) for p in pts]; ys=[int(getattr(p,"y",0)) for p in pts]
1115
  x1, x2 = min(xs), max(xs); y1, y2 = min(ys), max(ys); w, h = x2-x1, y2-y1
1116
  results.append({
@@ -1135,12 +1135,12 @@ def _decode_zbar(pil: Image.Image) -> List[Dict[str,Any]]:
1135
  out=[]
1136
  for d in res:
1137
  data = d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)
1138
- out.append({
1139
  "type": d.type, "data": data,
1140
  "left": d.rect.left, "top": d.rect.top,
1141
  "width": d.rect.width, "height": d.rect.height
1142
- })
1143
- return out
1144
  except Exception:
1145
  return []
1146
 
@@ -1307,12 +1307,34 @@ def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns',
1307
  return out
1308
 
1309
  # -------------------- Gradio Interface -----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1310
  def compare_pdfs(file_a, file_b):
1311
  """Main comparison function for Gradio interface"""
1312
  try:
1313
  if file_a is None or file_b is None:
1314
  return None, None, None, "❌ Please upload both PDF files to compare", [], []
1315
 
 
 
 
 
1316
  # Load images with multiple pages support
1317
  pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
1318
  pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)
 
41
  from pyspellchecker import SpellChecker
42
  HAS_SPELLCHECK = True
43
  except Exception:
44
+ SpellChecker = None
45
+ HAS_SPELLCHECK = False
46
 
47
  try:
48
  import regex as re
 
639
  import regex as re
640
  import pytesseract
641
  try:
642
+ from spellchecker import SpellChecker
643
  except ImportError:
644
  try:
645
  from pyspellchecker import SpellChecker
 
1098
  zx = zxingcpp.read_barcodes(arr)
1099
  for r in zx or []:
1100
  x1=y1=w=h=0
1101
+ pos = getattr(r, "position", None)
1102
  pts=[]
1103
+ if pos is not None:
1104
+ try:
1105
  pts=list(pos)
1106
+ except TypeError:
1107
  for name in ("top_left","topLeft","top_right","topRight","bottom_left","bottomLeft","bottom_right","bottomRight",
1108
  "point1","point2","point3","point4"):
1109
+ if hasattr(pos, name):
1110
  p=getattr(pos,name)
1111
  if hasattr(p,"x") and hasattr(p,"y"):
1112
+ pts.append(p)
1113
+ if pts:
1114
  xs=[int(getattr(p,"x",0)) for p in pts]; ys=[int(getattr(p,"y",0)) for p in pts]
1115
  x1, x2 = min(xs), max(xs); y1, y2 = min(ys), max(ys); w, h = x2-x1, y2-y1
1116
  results.append({
 
1135
  out=[]
1136
  for d in res:
1137
  data = d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)
1138
+ out.append({
1139
  "type": d.type, "data": data,
1140
  "left": d.rect.left, "top": d.rect.top,
1141
  "width": d.rect.width, "height": d.rect.height
1142
+ })
1143
+ return out
1144
  except Exception:
1145
  return []
1146
 
 
1307
  return out
1308
 
1309
  # -------------------- Gradio Interface -----------------
1310
+ def _contains_50_carroll(pdf_path: str) -> bool:
1311
+ """Check if PDF contains the text '50 carroll' (case insensitive)"""
1312
+ try:
1313
+ if not HAS_PYMUPDF:
1314
+ return True # Skip validation if PyMuPDF not available
1315
+
1316
+ doc = fitz.open(pdf_path)
1317
+ for page_num in range(min(len(doc), 5)): # Check first 5 pages
1318
+ page = doc[page_num]
1319
+ text = page.get_text().lower()
1320
+ if "50 carroll" in text:
1321
+ doc.close()
1322
+ return True
1323
+ doc.close()
1324
+ return False
1325
+ except Exception:
1326
+ return True # Skip validation on error
1327
+
1328
  def compare_pdfs(file_a, file_b):
1329
  """Main comparison function for Gradio interface"""
1330
  try:
1331
  if file_a is None or file_b is None:
1332
  return None, None, None, "❌ Please upload both PDF files to compare", [], []
1333
 
1334
+ # Check for "50 carroll" text in both files
1335
+ if not _contains_50_carroll(file_a.name) or not _contains_50_carroll(file_b.name):
1336
+ return None, None, None, "❌ Invalid File - Required text '50 carroll' not found", [], []
1337
+
1338
  # Load images with multiple pages support
1339
  pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
1340
  pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)