Spaces:
Sleeping
Sleeping
Yaz Hobooti
committed on
Commit
·
36496e9
1
Parent(s):
0533111
Add validation to check for '50 carroll' text in PDFs before analysis
Browse files
app.py
CHANGED
|
@@ -41,8 +41,8 @@ except Exception:
|
|
| 41 |
from pyspellchecker import SpellChecker
|
| 42 |
HAS_SPELLCHECK = True
|
| 43 |
except Exception:
|
| 44 |
-
|
| 45 |
-
|
| 46 |
|
| 47 |
try:
|
| 48 |
import regex as re
|
|
@@ -639,7 +639,7 @@ import unicodedata
|
|
| 639 |
import regex as re
|
| 640 |
import pytesseract
|
| 641 |
try:
|
| 642 |
-
|
| 643 |
except ImportError:
|
| 644 |
try:
|
| 645 |
from pyspellchecker import SpellChecker
|
|
@@ -1098,19 +1098,19 @@ def _decode_zxing_all(pil: Image.Image) -> List[Dict[str, Any]]:
|
|
| 1098 |
zx = zxingcpp.read_barcodes(arr)
|
| 1099 |
for r in zx or []:
|
| 1100 |
x1=y1=w=h=0
|
| 1101 |
-
|
| 1102 |
pts=[]
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
pts=list(pos)
|
| 1106 |
-
|
| 1107 |
for name in ("top_left","topLeft","top_right","topRight","bottom_left","bottomLeft","bottom_right","bottomRight",
|
| 1108 |
"point1","point2","point3","point4"):
|
| 1109 |
-
|
| 1110 |
p=getattr(pos,name)
|
| 1111 |
if hasattr(p,"x") and hasattr(p,"y"):
|
| 1112 |
-
|
| 1113 |
-
|
| 1114 |
xs=[int(getattr(p,"x",0)) for p in pts]; ys=[int(getattr(p,"y",0)) for p in pts]
|
| 1115 |
x1, x2 = min(xs), max(xs); y1, y2 = min(ys), max(ys); w, h = x2-x1, y2-y1
|
| 1116 |
results.append({
|
|
@@ -1135,12 +1135,12 @@ def _decode_zbar(pil: Image.Image) -> List[Dict[str,Any]]:
|
|
| 1135 |
out=[]
|
| 1136 |
for d in res:
|
| 1137 |
data = d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)
|
| 1138 |
-
|
| 1139 |
"type": d.type, "data": data,
|
| 1140 |
"left": d.rect.left, "top": d.rect.top,
|
| 1141 |
"width": d.rect.width, "height": d.rect.height
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
except Exception:
|
| 1145 |
return []
|
| 1146 |
|
|
@@ -1307,12 +1307,34 @@ def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns',
|
|
| 1307 |
return out
|
| 1308 |
|
| 1309 |
# -------------------- Gradio Interface -----------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1310 |
def compare_pdfs(file_a, file_b):
|
| 1311 |
"""Main comparison function for Gradio interface"""
|
| 1312 |
try:
|
| 1313 |
if file_a is None or file_b is None:
|
| 1314 |
return None, None, None, "❌ Please upload both PDF files to compare", [], []
|
| 1315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1316 |
# Load images with multiple pages support
|
| 1317 |
pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
|
| 1318 |
pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)
|
|
|
|
| 41 |
from pyspellchecker import SpellChecker
|
| 42 |
HAS_SPELLCHECK = True
|
| 43 |
except Exception:
|
| 44 |
+
SpellChecker = None
|
| 45 |
+
HAS_SPELLCHECK = False
|
| 46 |
|
| 47 |
try:
|
| 48 |
import regex as re
|
|
|
|
| 639 |
import regex as re
|
| 640 |
import pytesseract
|
| 641 |
try:
|
| 642 |
+
from spellchecker import SpellChecker
|
| 643 |
except ImportError:
|
| 644 |
try:
|
| 645 |
from pyspellchecker import SpellChecker
|
|
|
|
| 1098 |
zx = zxingcpp.read_barcodes(arr)
|
| 1099 |
for r in zx or []:
|
| 1100 |
x1=y1=w=h=0
|
| 1101 |
+
pos = getattr(r, "position", None)
|
| 1102 |
pts=[]
|
| 1103 |
+
if pos is not None:
|
| 1104 |
+
try:
|
| 1105 |
pts=list(pos)
|
| 1106 |
+
except TypeError:
|
| 1107 |
for name in ("top_left","topLeft","top_right","topRight","bottom_left","bottomLeft","bottom_right","bottomRight",
|
| 1108 |
"point1","point2","point3","point4"):
|
| 1109 |
+
if hasattr(pos, name):
|
| 1110 |
p=getattr(pos,name)
|
| 1111 |
if hasattr(p,"x") and hasattr(p,"y"):
|
| 1112 |
+
pts.append(p)
|
| 1113 |
+
if pts:
|
| 1114 |
xs=[int(getattr(p,"x",0)) for p in pts]; ys=[int(getattr(p,"y",0)) for p in pts]
|
| 1115 |
x1, x2 = min(xs), max(xs); y1, y2 = min(ys), max(ys); w, h = x2-x1, y2-y1
|
| 1116 |
results.append({
|
|
|
|
| 1135 |
out=[]
|
| 1136 |
for d in res:
|
| 1137 |
data = d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)
|
| 1138 |
+
out.append({
|
| 1139 |
"type": d.type, "data": data,
|
| 1140 |
"left": d.rect.left, "top": d.rect.top,
|
| 1141 |
"width": d.rect.width, "height": d.rect.height
|
| 1142 |
+
})
|
| 1143 |
+
return out
|
| 1144 |
except Exception:
|
| 1145 |
return []
|
| 1146 |
|
|
|
|
| 1307 |
return out
|
| 1308 |
|
| 1309 |
# -------------------- Gradio Interface -----------------
|
| 1310 |
+
def _contains_50_carroll(pdf_path: str) -> bool:
|
| 1311 |
+
"""Check if PDF contains the text '50 carroll' (case insensitive)"""
|
| 1312 |
+
try:
|
| 1313 |
+
if not HAS_PYMUPDF:
|
| 1314 |
+
return True # Skip validation if PyMuPDF not available
|
| 1315 |
+
|
| 1316 |
+
doc = fitz.open(pdf_path)
|
| 1317 |
+
for page_num in range(min(len(doc), 5)): # Check first 5 pages
|
| 1318 |
+
page = doc[page_num]
|
| 1319 |
+
text = page.get_text().lower()
|
| 1320 |
+
if "50 carroll" in text:
|
| 1321 |
+
doc.close()
|
| 1322 |
+
return True
|
| 1323 |
+
doc.close()
|
| 1324 |
+
return False
|
| 1325 |
+
except Exception:
|
| 1326 |
+
return True # Skip validation on error
|
| 1327 |
+
|
| 1328 |
def compare_pdfs(file_a, file_b):
|
| 1329 |
"""Main comparison function for Gradio interface"""
|
| 1330 |
try:
|
| 1331 |
if file_a is None or file_b is None:
|
| 1332 |
return None, None, None, "❌ Please upload both PDF files to compare", [], []
|
| 1333 |
|
| 1334 |
+
# Check for "50 carroll" text in both files
|
| 1335 |
+
if not _contains_50_carroll(file_a.name) or not _contains_50_carroll(file_b.name):
|
| 1336 |
+
return None, None, None, "❌ Invalid File - Required text '50 carroll' not found", [], []
|
| 1337 |
+
|
| 1338 |
# Load images with multiple pages support
|
| 1339 |
pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
|
| 1340 |
pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)
|