hkai20000 committed on
Commit
3f371b2
·
verified ·
1 Parent(s): 92a1e4b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +37 -26
main.py CHANGED
@@ -99,9 +99,9 @@ def get_ocr_predictor(det_arch: str, reco_arch: str):
99
  det_arch=det_arch,
100
  reco_arch=reco_arch,
101
  pretrained=True,
102
- assume_straight_pages=False, # Better for complex layouts
103
- straighten_pages=True, # Auto-correct rotation
104
- detect_orientation=True, # Detect page orientation
105
  preserve_aspect_ratio=True # Keep proportions
106
  )
107
  ocr_model_cache[cache_key] = predictor
@@ -197,55 +197,66 @@ def basic_cleanup(text: str) -> str:
197
  def extract_text_structured(result) -> str:
198
  """
199
  Extract text from docTR result preserving logical structure.
200
- Groups text blocks by vertical position for better table handling.
201
- Sorts words within each line by x-position (left to right).
202
  """
203
  all_lines = []
204
 
205
  for page in result.pages:
206
  for block in page.blocks:
207
  for line in block.lines:
208
- # Collect all words with their positions
209
- words_in_line = []
210
  for word in line.words:
211
- words_in_line.append({
 
 
 
212
  'text': word.value,
213
- 'x': word.geometry[0][0], # x position (left edge)
214
- 'y': word.geometry[0][1] # y position (top edge)
215
  })
216
 
217
- if not words_in_line:
218
  continue
219
 
220
  # Sort words by x position (left to right)
221
- words_in_line.sort(key=lambda w: w['x'])
222
 
223
- # Build line text from sorted words
224
- line_text = " ".join([w['text'] for w in words_in_line])
 
225
 
226
  if line_text.strip():
227
- min_y = min(w['y'] for w in words_in_line)
228
- min_x = min(w['x'] for w in words_in_line)
229
  all_lines.append({
230
  'text': line_text.strip(),
231
- 'y': min_y,
232
  'x': min_x
233
  })
234
 
235
- # Sort lines by y position (top to bottom), then x (left to right for same row)
236
  all_lines.sort(key=lambda l: (round(l['y'] * 20) / 20, l['x']))
237
 
238
- # Build final text with line breaks between different y-groups
239
- result_text = ""
240
- prev_y = -1
 
 
241
  for line_info in all_lines:
242
  current_y_group = round(line_info['y'] * 20) / 20
243
- if prev_y != -1 and current_y_group != prev_y:
244
- result_text += "\n"
245
- result_text += line_info['text'] + " "
246
- prev_y = current_y_group
247
 
248
- return result_text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
  def extract_words_with_boxes(result) -> list:
251
  """
 
99
  det_arch=det_arch,
100
  reco_arch=reco_arch,
101
  pretrained=True,
102
+ assume_straight_pages=True, # Assume pages are already straight
103
+ straighten_pages=False, # Don't auto-rotate (was causing issues)
104
+ detect_orientation=False, # Don't detect orientation (was inverting text)
105
  preserve_aspect_ratio=True # Keep proportions
106
  )
107
  ocr_model_cache[cache_key] = predictor
 
197
  def extract_text_structured(result) -> str:
198
  """
199
  Extract text from docTR result preserving logical structure.
200
+ Explicitly sorts words by x-coordinate and lines by y-coordinate.
 
201
  """
202
  all_lines = []
203
 
204
  for page in result.pages:
205
  for block in page.blocks:
206
  for line in block.lines:
207
+ # Collect words with their x-positions
208
+ words_data = []
209
  for word in line.words:
210
+ # geometry is ((x_min, y_min), (x_max, y_max))
211
+ x_pos = word.geometry[0][0] # Left edge x-coordinate
212
+ y_pos = word.geometry[0][1] # Top edge y-coordinate
213
+ words_data.append({
214
  'text': word.value,
215
+ 'x': x_pos,
216
+ 'y': y_pos
217
  })
218
 
219
+ if not words_data:
220
  continue
221
 
222
  # Sort words by x position (left to right)
223
+ words_data.sort(key=lambda w: w['x'])
224
 
225
+ line_text = " ".join([w['text'] for w in words_data])
226
+ avg_y = sum(w['y'] for w in words_data) / len(words_data)
227
+ min_x = min(w['x'] for w in words_data)
228
 
229
  if line_text.strip():
 
 
230
  all_lines.append({
231
  'text': line_text.strip(),
232
+ 'y': avg_y,
233
  'x': min_x
234
  })
235
 
236
+ # Sort lines by y position (top to bottom)
237
  all_lines.sort(key=lambda l: (round(l['y'] * 20) / 20, l['x']))
238
 
239
+ # Group lines by y-position and build final text
240
+ result_lines = []
241
+ prev_y_group = -1
242
+ current_line_parts = []
243
+
244
  for line_info in all_lines:
245
  current_y_group = round(line_info['y'] * 20) / 20
 
 
 
 
246
 
247
+ if prev_y_group != -1 and current_y_group != prev_y_group:
248
+ if current_line_parts:
249
+ result_lines.append(" ".join(current_line_parts))
250
+ current_line_parts = [line_info['text']]
251
+ else:
252
+ current_line_parts.append(line_info['text'])
253
+
254
+ prev_y_group = current_y_group
255
+
256
+ if current_line_parts:
257
+ result_lines.append(" ".join(current_line_parts))
258
+
259
+ return "\n".join(result_lines)
260
 
261
  def extract_words_with_boxes(result) -> list:
262
  """