Spaces:

Digitaljoint
/

ProofCheck

Sleeping

App Files Files Community

Yaz Hobooti committed on Sep 28, 2025

Commit

36496e9

1 Parent(s): 0533111

Add validation to check for '50 carroll' text in PDFs before analysis

Browse files

Files changed (1) hide show

app.py +35 -13

app.py CHANGED Viewed

@@ -41,8 +41,8 @@ except Exception:
         from pyspellchecker import SpellChecker
         HAS_SPELLCHECK = True
     except Exception:
-        SpellChecker = None
-        HAS_SPELLCHECK = False
 try:
     import regex as re
@@ -639,7 +639,7 @@ import unicodedata
 import regex as re
 import pytesseract
 try:
-    from spellchecker import SpellChecker
 except ImportError:
     try:
         from pyspellchecker import SpellChecker
@@ -1098,19 +1098,19 @@ def _decode_zxing_all(pil: Image.Image) -> List[Dict[str, Any]]:
             zx = zxingcpp.read_barcodes(arr)
         for r in zx or []:
             x1=y1=w=h=0
-            pos = getattr(r, "position", None)
             pts=[]
-            if pos is not None:
-                try:
                     pts=list(pos)
-                except TypeError:
                     for name in ("top_left","topLeft","top_right","topRight","bottom_left","bottomLeft","bottom_right","bottomRight",
                                  "point1","point2","point3","point4"):
-                        if hasattr(pos, name):
                             p=getattr(pos,name)
                             if hasattr(p,"x") and hasattr(p,"y"):
-                                pts.append(p)
-            if pts:
                 xs=[int(getattr(p,"x",0)) for p in pts]; ys=[int(getattr(p,"y",0)) for p in pts]
                 x1, x2 = min(xs), max(xs); y1, y2 = min(ys), max(ys); w, h = x2-x1, y2-y1
             results.append({
@@ -1135,12 +1135,12 @@ def _decode_zbar(pil: Image.Image) -> List[Dict[str,Any]]:
         out=[]
         for d in res:
             data = d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)
-            out.append({
                 "type": d.type, "data": data,
                 "left": d.rect.left, "top": d.rect.top,
                 "width": d.rect.width, "height": d.rect.height
-            })
-        return out
     except Exception:
         return []
@@ -1307,12 +1307,34 @@ def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns',
     return out
 # -------------------- Gradio Interface -----------------
 def compare_pdfs(file_a, file_b):
     """Main comparison function for Gradio interface"""
     try:
         if file_a is None or file_b is None:
             return None, None, None, "❌ Please upload both PDF files to compare", [], []
         # Load images with multiple pages support
         pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
         pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)

         from pyspellchecker import SpellChecker
         HAS_SPELLCHECK = True
     except Exception:
+    SpellChecker = None
+    HAS_SPELLCHECK = False
 try:
     import regex as re
 import regex as re
 import pytesseract
 try:
+from spellchecker import SpellChecker
 except ImportError:
     try:
         from pyspellchecker import SpellChecker
             zx = zxingcpp.read_barcodes(arr)
         for r in zx or []:
             x1=y1=w=h=0
+        pos = getattr(r, "position", None)
             pts=[]
+        if pos is not None:
+            try:
                     pts=list(pos)
+            except TypeError:
                     for name in ("top_left","topLeft","top_right","topRight","bottom_left","bottomLeft","bottom_right","bottomRight",
                                  "point1","point2","point3","point4"):
+                    if hasattr(pos, name):
                             p=getattr(pos,name)
                             if hasattr(p,"x") and hasattr(p,"y"):
+                            pts.append(p)
+        if pts:
                 xs=[int(getattr(p,"x",0)) for p in pts]; ys=[int(getattr(p,"y",0)) for p in pts]
                 x1, x2 = min(xs), max(xs); y1, y2 = min(ys), max(ys); w, h = x2-x1, y2-y1
             results.append({
         out=[]
         for d in res:
             data = d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)
+        out.append({
                 "type": d.type, "data": data,
                 "left": d.rect.left, "top": d.rect.top,
                 "width": d.rect.width, "height": d.rect.height
+        })
+    return out
     except Exception:
         return []
     return out
 # -------------------- Gradio Interface -----------------
+def _contains_50_carroll(pdf_path: str) -> bool:
+    """Return whether the PDF's first pages contain the text '50 carroll'.
+
+    The match is case-insensitive and only the first five pages are scanned.
+    The check is best-effort: it returns True (validation skipped) when
+    PyMuPDF is unavailable or when opening/reading the file raises.
+
+    Args:
+        pdf_path: Filesystem path of the PDF to inspect.
+
+    Returns:
+        False only when the scanned pages were read successfully and none
+        of them contains '50 carroll'; True otherwise.
+    """
+    try:
+        if not HAS_PYMUPDF:
+            return True  # Skip validation if PyMuPDF not available
+        doc = fitz.open(pdf_path)
+        try:
+            for page_num in range(min(len(doc), 5)):  # Check first 5 pages
+                text = doc[page_num].get_text().lower()
+                if "50 carroll" in text:
+                    return True
+            return False
+        finally:
+            # Always release the document handle, including when text
+            # extraction raises part-way through the scan.
+            doc.close()
+    except Exception:
+        return True  # Skip validation on error
 def compare_pdfs(file_a, file_b):
     """Main comparison function for Gradio interface"""
     try:
         if file_a is None or file_b is None:
             return None, None, None, "❌ Please upload both PDF files to compare", [], []
+        # Check for "50 carroll" text in both files
+        if not _contains_50_carroll(file_a.name) or not _contains_50_carroll(file_b.name):
+            return None, None, None, "❌ Invalid File - Required text '50 carroll' not found", [], []
         # Load images with multiple pages support
         pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
         pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)