Yaz Hobooti
committed on
Commit
·
2874826
1
Parent(s):
bae9f7f
Fix indentation error in find_misspell_boxes function
Browse files
- pdf_comparator.py +8 -8
pdf_comparator.py
CHANGED
|
@@ -131,25 +131,25 @@ def find_misspell_boxes(img: Image.Image) -> List[Box]:
|
|
| 131 |
try:
|
| 132 |
spell = SpellChecker()
|
| 133 |
data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
boxes: List[Box] = []
|
| 138 |
-
|
| 139 |
text = data["text"][i]
|
| 140 |
if not text:
|
| 141 |
-
|
| 142 |
token = normalize_token(text)
|
| 143 |
if len(token) < 2:
|
| 144 |
-
|
| 145 |
if token in spell:
|
| 146 |
-
|
| 147 |
left = data.get("left", [0])[i]
|
| 148 |
top = data.get("top", [0])[i]
|
| 149 |
width = data.get("width", [0])[i]
|
| 150 |
height= data.get("height",[0])[i]
|
| 151 |
if width <= 0 or height <= 0:
|
| 152 |
-
|
| 153 |
boxes.append(Box(top, left, top+height, left+width, width*height))
|
| 154 |
return boxes
|
| 155 |
|
|
|
|
| 131 |
try:
|
| 132 |
spell = SpellChecker()
|
| 133 |
data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
|
| 134 |
+
except Exception:
|
| 135 |
+
return []
|
| 136 |
+
n = len(data.get("text", []))
|
| 137 |
boxes: List[Box] = []
|
| 138 |
+
for i in range(n):
|
| 139 |
text = data["text"][i]
|
| 140 |
if not text:
|
| 141 |
+
continue
|
| 142 |
token = normalize_token(text)
|
| 143 |
if len(token) < 2:
|
| 144 |
+
continue
|
| 145 |
if token in spell:
|
| 146 |
+
continue
|
| 147 |
left = data.get("left", [0])[i]
|
| 148 |
top = data.get("top", [0])[i]
|
| 149 |
width = data.get("width", [0])[i]
|
| 150 |
height= data.get("height",[0])[i]
|
| 151 |
if width <= 0 or height <= 0:
|
| 152 |
+
continue
|
| 153 |
boxes.append(Box(top, left, top+height, left+width, width*height))
|
| 154 |
return boxes
|
| 155 |
|