hkai20000 committed on
Commit
92a1e4b
·
verified ·
1 Parent(s): e3a5f01

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +28 -12
main.py CHANGED
@@ -198,35 +198,51 @@ def extract_text_structured(result) -> str:
198
  """
199
  Extract text from docTR result preserving logical structure.
200
  Groups text blocks by vertical position for better table handling.
 
201
  """
202
- all_words = []
203
 
204
  for page in result.pages:
205
  for block in page.blocks:
206
  for line in block.lines:
207
- line_text = ""
208
- min_y = float('inf')
209
-
210
  for word in line.words:
211
- line_text += word.value + " "
212
- min_y = min(min_y, word.geometry[0][1])
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
  if line_text.strip():
215
- all_words.append({
 
 
216
  'text': line_text.strip(),
217
  'y': min_y,
218
- 'x': line.geometry[0][0] if hasattr(line, 'geometry') else 0
219
  })
220
 
221
- all_words.sort(key=lambda w: (round(w['y'] * 20) / 20, w['x']))
 
222
 
 
223
  result_text = ""
224
  prev_y = -1
225
- for word_info in all_words:
226
- current_y_group = round(word_info['y'] * 20) / 20
227
  if prev_y != -1 and current_y_group != prev_y:
228
  result_text += "\n"
229
- result_text += word_info['text'] + " "
230
  prev_y = current_y_group
231
 
232
  return result_text.strip()
 
198
  """
199
  Extract text from docTR result preserving logical structure.
200
  Groups text blocks by vertical position for better table handling.
201
+ Sorts words within each line by x-position (left to right).
202
  """
203
+ all_lines = []
204
 
205
  for page in result.pages:
206
  for block in page.blocks:
207
  for line in block.lines:
208
+ # Collect all words with their positions
209
+ words_in_line = []
 
210
  for word in line.words:
211
+ words_in_line.append({
212
+ 'text': word.value,
213
+ 'x': word.geometry[0][0], # x position (left edge)
214
+ 'y': word.geometry[0][1] # y position (top edge)
215
+ })
216
+
217
+ if not words_in_line:
218
+ continue
219
+
220
+ # Sort words by x position (left to right)
221
+ words_in_line.sort(key=lambda w: w['x'])
222
+
223
+ # Build line text from sorted words
224
+ line_text = " ".join([w['text'] for w in words_in_line])
225
 
226
  if line_text.strip():
227
+ min_y = min(w['y'] for w in words_in_line)
228
+ min_x = min(w['x'] for w in words_in_line)
229
+ all_lines.append({
230
  'text': line_text.strip(),
231
  'y': min_y,
232
+ 'x': min_x
233
  })
234
 
235
+ # Sort lines by y position (top to bottom), then x (left to right for same row)
236
+ all_lines.sort(key=lambda l: (round(l['y'] * 20) / 20, l['x']))
237
 
238
+ # Build final text with line breaks between different y-groups
239
  result_text = ""
240
  prev_y = -1
241
+ for line_info in all_lines:
242
+ current_y_group = round(line_info['y'] * 20) / 20
243
  if prev_y != -1 and current_y_group != prev_y:
244
  result_text += "\n"
245
+ result_text += line_info['text'] + " "
246
  prev_y = current_y_group
247
 
248
  return result_text.strip()