Update main.py
Browse files
main.py
CHANGED
|
@@ -198,35 +198,51 @@ def extract_text_structured(result) -> str:
|
|
| 198 |
"""
|
| 199 |
Extract text from docTR result preserving logical structure.
|
| 200 |
Groups text blocks by vertical position for better table handling.
|
|
|
|
| 201 |
"""
|
| 202 |
-
|
| 203 |
|
| 204 |
for page in result.pages:
|
| 205 |
for block in page.blocks:
|
| 206 |
for line in block.lines:
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
for word in line.words:
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
if line_text.strip():
|
| 215 |
-
|
|
|
|
|
|
|
| 216 |
'text': line_text.strip(),
|
| 217 |
'y': min_y,
|
| 218 |
-
'x':
|
| 219 |
})
|
| 220 |
|
| 221 |
-
|
|
|
|
| 222 |
|
|
|
|
| 223 |
result_text = ""
|
| 224 |
prev_y = -1
|
| 225 |
-
for
|
| 226 |
-
current_y_group = round(
|
| 227 |
if prev_y != -1 and current_y_group != prev_y:
|
| 228 |
result_text += "\n"
|
| 229 |
-
result_text +=
|
| 230 |
prev_y = current_y_group
|
| 231 |
|
| 232 |
return result_text.strip()
|
|
|
|
| 198 |
"""
|
| 199 |
Extract text from docTR result preserving logical structure.
|
| 200 |
Groups text blocks by vertical position for better table handling.
|
| 201 |
+
Sorts words within each line by x-position (left to right).
|
| 202 |
"""
|
| 203 |
+
all_lines = []
|
| 204 |
|
| 205 |
for page in result.pages:
|
| 206 |
for block in page.blocks:
|
| 207 |
for line in block.lines:
|
| 208 |
+
# Collect all words with their positions
|
| 209 |
+
words_in_line = []
|
|
|
|
| 210 |
for word in line.words:
|
| 211 |
+
words_in_line.append({
|
| 212 |
+
'text': word.value,
|
| 213 |
+
'x': word.geometry[0][0], # x position (left edge)
|
| 214 |
+
'y': word.geometry[0][1] # y position (top edge)
|
| 215 |
+
})
|
| 216 |
+
|
| 217 |
+
if not words_in_line:
|
| 218 |
+
continue
|
| 219 |
+
|
| 220 |
+
# Sort words by x position (left to right)
|
| 221 |
+
words_in_line.sort(key=lambda w: w['x'])
|
| 222 |
+
|
| 223 |
+
# Build line text from sorted words
|
| 224 |
+
line_text = " ".join([w['text'] for w in words_in_line])
|
| 225 |
|
| 226 |
if line_text.strip():
|
| 227 |
+
min_y = min(w['y'] for w in words_in_line)
|
| 228 |
+
min_x = min(w['x'] for w in words_in_line)
|
| 229 |
+
all_lines.append({
|
| 230 |
'text': line_text.strip(),
|
| 231 |
'y': min_y,
|
| 232 |
+
'x': min_x
|
| 233 |
})
|
| 234 |
|
| 235 |
+
# Sort lines by y position (top to bottom), then x (left to right for same row)
|
| 236 |
+
all_lines.sort(key=lambda l: (round(l['y'] * 20) / 20, l['x']))
|
| 237 |
|
| 238 |
+
# Build final text with line breaks between different y-groups
|
| 239 |
result_text = ""
|
| 240 |
prev_y = -1
|
| 241 |
+
for line_info in all_lines:
|
| 242 |
+
current_y_group = round(line_info['y'] * 20) / 20
|
| 243 |
if prev_y != -1 and current_y_group != prev_y:
|
| 244 |
result_text += "\n"
|
| 245 |
+
result_text += line_info['text'] + " "
|
| 246 |
prev_y = current_y_group
|
| 247 |
|
| 248 |
return result_text.strip()
|