Update main.py
Browse files
main.py
CHANGED
|
@@ -99,9 +99,9 @@ def get_ocr_predictor(det_arch: str, reco_arch: str):
|
|
| 99 |
det_arch=det_arch,
|
| 100 |
reco_arch=reco_arch,
|
| 101 |
pretrained=True,
|
| 102 |
-
assume_straight_pages=
|
| 103 |
-
straighten_pages=
|
| 104 |
-
detect_orientation=
|
| 105 |
preserve_aspect_ratio=True # Keep proportions
|
| 106 |
)
|
| 107 |
ocr_model_cache[cache_key] = predictor
|
|
@@ -197,55 +197,66 @@ def basic_cleanup(text: str) -> str:
|
|
| 197 |
def extract_text_structured(result) -> str:
|
| 198 |
"""
|
| 199 |
Extract text from docTR result preserving logical structure.
|
| 200 |
-
|
| 201 |
-
Sorts words within each line by x-position (left to right).
|
| 202 |
"""
|
| 203 |
all_lines = []
|
| 204 |
|
| 205 |
for page in result.pages:
|
| 206 |
for block in page.blocks:
|
| 207 |
for line in block.lines:
|
| 208 |
-
# Collect
|
| 209 |
-
|
| 210 |
for word in line.words:
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
| 212 |
'text': word.value,
|
| 213 |
-
'x':
|
| 214 |
-
'y':
|
| 215 |
})
|
| 216 |
|
| 217 |
-
if not
|
| 218 |
continue
|
| 219 |
|
| 220 |
# Sort words by x position (left to right)
|
| 221 |
-
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
|
|
|
| 225 |
|
| 226 |
if line_text.strip():
|
| 227 |
-
min_y = min(w['y'] for w in words_in_line)
|
| 228 |
-
min_x = min(w['x'] for w in words_in_line)
|
| 229 |
all_lines.append({
|
| 230 |
'text': line_text.strip(),
|
| 231 |
-
'y':
|
| 232 |
'x': min_x
|
| 233 |
})
|
| 234 |
|
| 235 |
-
# Sort lines by y position (top to bottom)
|
| 236 |
all_lines.sort(key=lambda l: (round(l['y'] * 20) / 20, l['x']))
|
| 237 |
|
| 238 |
-
#
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
| 241 |
for line_info in all_lines:
|
| 242 |
current_y_group = round(line_info['y'] * 20) / 20
|
| 243 |
-
if prev_y != -1 and current_y_group != prev_y:
|
| 244 |
-
result_text += "\n"
|
| 245 |
-
result_text += line_info['text'] + " "
|
| 246 |
-
prev_y = current_y_group
|
| 247 |
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
def extract_words_with_boxes(result) -> list:
|
| 251 |
"""
|
|
|
|
| 99 |
det_arch=det_arch,
|
| 100 |
reco_arch=reco_arch,
|
| 101 |
pretrained=True,
|
| 102 |
+
assume_straight_pages=True, # Assume pages are already straight
|
| 103 |
+
straighten_pages=False, # Don't auto-rotate (was causing issues)
|
| 104 |
+
detect_orientation=False, # Don't detect orientation (was inverting text)
|
| 105 |
preserve_aspect_ratio=True # Keep proportions
|
| 106 |
)
|
| 107 |
ocr_model_cache[cache_key] = predictor
|
|
|
|
| 197 |
def extract_text_structured(result) -> str:
    """
    Extract text from docTR result preserving logical structure.

    Words inside each detected line are ordered left to right by their
    left-edge x-coordinate. Within each page, lines are ordered top to
    bottom, and lines whose average top-edge y falls into the same
    horizontal band are joined with a single space into one output row
    (ordered by their left-most x). Pages are handled one at a time, in
    document order, so text from different pages is never interleaved or
    merged into the same row.

    Args:
        result: Object exposing ``pages`` -> ``blocks`` -> ``lines`` ->
            ``words``, where every word has a ``value`` string and a
            ``geometry`` of the form ((x_min, y_min), (x_max, y_max)).

    Returns:
        The output rows joined with newlines; an empty string when the
        result contains no non-blank text.
    """
    # Lines are bucketed into bands of 1/20 of the y-range so that lines
    # at (almost) the same height end up on the same output row.
    # NOTE(review): presumably geometry is page-relative (0..1), making a
    # band 5% of the page height -- confirm against the docTR version in use.
    bands = 20

    output_rows = []

    for page in result.pages:
        # (band, left-most x, text) for every non-blank line of this page.
        page_lines = []

        for block in page.blocks:
            for line in block.lines:
                # geometry is ((x_min, y_min), (x_max, y_max)); keep the
                # top-left corner plus the recognised text of each word.
                words = sorted(
                    (
                        (word.geometry[0][0], word.geometry[0][1], word.value)
                        for word in line.words
                    ),
                    key=lambda w: w[0],  # left to right; stable on ties
                )
                if not words:
                    continue

                text = " ".join(w[2] for w in words).strip()
                if not text:
                    continue

                avg_y = sum(w[1] for w in words) / len(words)
                band = round(avg_y * bands) / bands
                # After the sort, the first word carries the smallest x.
                page_lines.append((band, words[0][0], text))

        # Top to bottom, then left to right inside a band. The key omits
        # the text so equal positions keep their detection order.
        page_lines.sort(key=lambda item: (item[0], item[1]))

        row_parts = []
        row_band = None
        for band, _, text in page_lines:
            # A change of band closes the row that is being built.
            if row_parts and band != row_band:
                output_rows.append(" ".join(row_parts))
                row_parts = []
            row_parts.append(text)
            row_band = band

        # Flush the last row of the page so it cannot merge with the
        # first row of the next page.
        if row_parts:
            output_rows.append(" ".join(row_parts))

    return "\n".join(output_rows)
|
| 260 |
|
| 261 |
def extract_words_with_boxes(result) -> list:
|
| 262 |
"""
|