Alfonso Velasco committed on
Commit
c9e5fd6
·
1 Parent(s): 179cb76

fix chunk

Browse files
Files changed (1) hide show
  1. app.py +211 -135
app.py CHANGED
@@ -68,6 +68,9 @@ async def extract_document(request: DocumentRequest):
68
  return process_image(file_bytes)
69
 
70
  except Exception as e:
 
 
 
71
  raise HTTPException(status_code=500, detail=str(e))
72
 
73
  def process_image_chunk(image: Image.Image) -> List[Dict]:
@@ -77,25 +80,34 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
77
  """
78
  img_width, img_height = image.size
79
 
 
 
 
 
 
80
  try:
81
  encoding = processor(
82
  image,
83
  truncation=True,
84
  padding="max_length",
85
- max_length=1024,
86
  return_tensors="pt"
87
  )
88
  except Exception as e:
89
  print(f"OCR failed: {e}, using fallback")
90
- encoding = processor(
91
- image,
92
- text=[""] * 1024,
93
- boxes=[[0, 0, 0, 0]] * 1024,
94
- truncation=True,
95
- padding="max_length",
96
- max_length=1024,
97
- return_tensors="pt"
98
- )
 
 
 
 
99
 
100
  # Move to device and ensure bbox is clamped to valid range
101
  encoding_device = {}
@@ -116,59 +128,73 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
116
  print(f"CUDA error encountered: {e}")
117
  print("Falling back to CPU...")
118
  # Move everything to CPU
119
- encoding = {k: v.cpu() for k, v in encoding.items()}
120
  model.cpu()
121
  with torch.no_grad():
122
  outputs = model(**encoding)
123
  # Move model back to original device
124
  model.to(device)
 
 
 
125
  else:
126
  raise
 
 
 
127
 
128
- tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
129
- boxes = encoding["bbox"][0].tolist()
 
 
 
 
130
 
131
  results = []
132
  processed_boxes = set()
133
 
134
- for token, box in zip(tokens, boxes):
135
- if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
136
- x_norm = box[0]
137
- y_norm = box[1]
138
- x2_norm = box[2]
139
- y2_norm = box[3]
140
-
141
- if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
142
- continue
143
-
144
- # Convert normalized coordinates to chunk pixel coordinates
145
- x = (x_norm / 1000.0) * img_width
146
- y = (y_norm / 1000.0) * img_height
147
- x2 = (x2_norm / 1000.0) * img_width
148
- y2 = (y2_norm / 1000.0) * img_height
149
-
150
- width = x2 - x
151
- height = y2 - y
152
-
153
- if width < 1 or height < 1:
154
- continue
155
-
156
- box_tuple = (round(x), round(y), round(width), round(height))
157
- if box_tuple in processed_boxes:
158
- continue
159
- processed_boxes.add(box_tuple)
160
-
161
- clean_token = token.replace('##', '')
162
-
163
- results.append({
164
- "text": clean_token,
165
- "bbox": {
166
- "x": x,
167
- "y": y,
168
- "width": width,
169
- "height": height
170
- }
171
- })
 
 
 
 
172
 
173
  return results
174
 
@@ -180,112 +206,162 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
180
  tmp_file.write(pdf_bytes)
181
  tmp_file.flush()
182
 
183
- pdf_document = fitz.open(tmp_file.name)
 
 
 
 
184
 
185
  RENDER_SCALE = 2.0
186
- MAX_WIDTH = 2000 # Maximum width before splitting (in pixels after rendering)
187
- OVERLAP = 200 # Overlap between chunks to avoid missing text at boundaries
188
 
189
  for page_num in range(len(pdf_document)):
190
- page = pdf_document[page_num]
191
- page_rect = page.rect
192
- page_width = page_rect.width
193
- page_height = page_rect.height
194
-
195
- print(f"Page {page_num + 1}: {page_width}x{page_height}, rotation={page.rotation}°")
196
-
197
- # Render page
198
- mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
199
- pix = page.get_pixmap(matrix=mat)
200
- img_data = pix.tobytes("png")
201
- full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
202
- img_width, img_height = full_image.size
203
-
204
- print(f"Rendered image: {img_width}x{img_height}")
205
-
206
- page_results = []
207
-
208
- # Check if page is too wide and should be split
209
- if split_wide and img_width > MAX_WIDTH:
210
- print(f"Page is wide ({img_width}px), splitting into chunks...")
211
 
212
- num_chunks = (img_width + MAX_WIDTH - OVERLAP - 1) // (MAX_WIDTH - OVERLAP)
213
- chunk_width = MAX_WIDTH
214
 
215
- for chunk_idx in range(num_chunks):
216
- # Calculate chunk boundaries in rendered image pixels
217
- start_x = chunk_idx * (chunk_width - OVERLAP)
218
- end_x = min(start_x + chunk_width, img_width)
219
 
220
- # Crop chunk from rendered image
221
- chunk = full_image.crop((start_x, 0, end_x, img_height))
 
 
222
 
223
- print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")
224
 
225
- # Process chunk (returns coordinates relative to chunk)
226
- chunk_results = process_image_chunk(chunk)
227
 
228
- # Transform chunk-relative coordinates to full page coordinates
229
- for result in chunk_results:
230
- bbox = result['bbox']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
- # Add chunk offset (in rendered image pixels)
233
- bbox['x'] += start_x
234
- # y stays the same (no vertical splitting)
235
- # bbox['y'] is already correct
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- # Now scale from rendered image pixels to PDF points
 
 
 
 
 
 
 
 
 
238
  bbox['x'] = bbox['x'] / RENDER_SCALE
239
  bbox['y'] = bbox['y'] / RENDER_SCALE
240
  bbox['width'] = bbox['width'] / RENDER_SCALE
241
  bbox['height'] = bbox['height'] / RENDER_SCALE
242
 
243
- page_results.extend(chunk_results)
244
-
245
- print(f" Total extractions from all chunks: {len(page_results)}")
 
 
246
 
247
- else:
248
- # Process full page (no splitting needed)
249
- chunk_results = process_image_chunk(full_image)
250
 
251
- # Scale coordinates from rendered image pixels to PDF points
252
- for result in chunk_results:
253
  bbox = result['bbox']
254
- bbox['x'] = bbox['x'] / RENDER_SCALE
255
- bbox['y'] = bbox['y'] / RENDER_SCALE
256
- bbox['width'] = bbox['width'] / RENDER_SCALE
257
- bbox['height'] = bbox['height'] / RENDER_SCALE
 
 
 
 
 
 
258
 
259
- page_results = chunk_results
260
-
261
- # Remove duplicates from overlapping chunks
262
- unique_results = []
263
- seen_boxes = set()
264
-
265
- for result in page_results:
266
- bbox = result['bbox']
267
- box_tuple = (
268
- round(bbox['x']),
269
- round(bbox['y']),
270
- round(bbox['width']),
271
- round(bbox['height'])
272
- )
273
 
274
- if box_tuple not in seen_boxes:
275
- seen_boxes.add(box_tuple)
276
- unique_results.append(result)
277
-
278
- print(f" After deduplication: {len(unique_results)} unique extractions")
279
-
280
- all_results.append({
281
- "page": page_num + 1,
282
- "page_dimensions": {
283
- "width": page_width,
284
- "height": page_height
285
- },
286
- "rotation": page.rotation,
287
- "extractions": unique_results
288
- })
 
 
 
 
 
 
 
289
 
290
  pdf_document.close()
291
  os.unlink(tmp_file.name)
 
68
  return process_image(file_bytes)
69
 
70
  except Exception as e:
71
+ import traceback
72
+ error_details = traceback.format_exc()
73
+ print(f"Error in extract_document: {error_details}")
74
  raise HTTPException(status_code=500, detail=str(e))
75
 
76
  def process_image_chunk(image: Image.Image) -> List[Dict]:
 
80
  """
81
  img_width, img_height = image.size
82
 
83
+ # Validate image dimensions
84
+ if img_width < 1 or img_height < 1:
85
+ print(f"Invalid image dimensions: {img_width}x{img_height}")
86
+ return []
87
+
88
  try:
89
  encoding = processor(
90
  image,
91
  truncation=True,
92
  padding="max_length",
93
+ max_length=512, # Reduced from 1024 for better stability
94
  return_tensors="pt"
95
  )
96
  except Exception as e:
97
  print(f"OCR failed: {e}, using fallback")
98
+ try:
99
+ encoding = processor(
100
+ image,
101
+ text=[""] * 512,
102
+ boxes=[[0, 0, 0, 0]] * 512,
103
+ truncation=True,
104
+ padding="max_length",
105
+ max_length=512,
106
+ return_tensors="pt"
107
+ )
108
+ except Exception as e2:
109
+ print(f"Fallback also failed: {e2}")
110
+ return []
111
 
112
  # Move to device and ensure bbox is clamped to valid range
113
  encoding_device = {}
 
128
  print(f"CUDA error encountered: {e}")
129
  print("Falling back to CPU...")
130
  # Move everything to CPU
131
+ encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
132
  model.cpu()
133
  with torch.no_grad():
134
  outputs = model(**encoding)
135
  # Move model back to original device
136
  model.to(device)
137
+ elif "index out of range" in str(e):
138
+ print(f"Index error in model processing: {e}")
139
+ return []
140
  else:
141
  raise
142
+ except Exception as e:
143
+ print(f"Unexpected error in model processing: {e}")
144
+ return []
145
 
146
+ try:
147
+ tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
148
+ boxes = encoding["bbox"][0].tolist()
149
+ except Exception as e:
150
+ print(f"Error extracting tokens/boxes: {e}")
151
+ return []
152
 
153
  results = []
154
  processed_boxes = set()
155
 
156
+ for idx, (token, box) in enumerate(zip(tokens, boxes)):
157
+ try:
158
+ if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
159
+ x_norm = box[0]
160
+ y_norm = box[1]
161
+ x2_norm = box[2]
162
+ y2_norm = box[3]
163
+
164
+ if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
165
+ continue
166
+
167
+ # Convert normalized coordinates to chunk pixel coordinates
168
+ x = (x_norm / 1000.0) * img_width
169
+ y = (y_norm / 1000.0) * img_height
170
+ x2 = (x2_norm / 1000.0) * img_width
171
+ y2 = (y2_norm / 1000.0) * img_height
172
+
173
+ width = x2 - x
174
+ height = y2 - y
175
+
176
+ if width < 1 or height < 1:
177
+ continue
178
+
179
+ box_tuple = (round(x), round(y), round(width), round(height))
180
+ if box_tuple in processed_boxes:
181
+ continue
182
+ processed_boxes.add(box_tuple)
183
+
184
+ clean_token = token.replace('##', '')
185
+
186
+ results.append({
187
+ "text": clean_token,
188
+ "bbox": {
189
+ "x": x,
190
+ "y": y,
191
+ "width": width,
192
+ "height": height
193
+ }
194
+ })
195
+ except Exception as e:
196
+ print(f"Error processing token at index {idx}: {e}")
197
+ continue
198
 
199
  return results
200
 
 
206
  tmp_file.write(pdf_bytes)
207
  tmp_file.flush()
208
 
209
+ try:
210
+ pdf_document = fitz.open(tmp_file.name)
211
+ except Exception as e:
212
+ os.unlink(tmp_file.name)
213
+ raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
214
 
215
  RENDER_SCALE = 2.0
216
+ MAX_WIDTH = 1800 # Reduced from 2000 for better stability
217
+ OVERLAP = 150 # Reduced overlap
218
 
219
  for page_num in range(len(pdf_document)):
220
+ try:
221
+ page = pdf_document[page_num]
222
+ page_rect = page.rect
223
+ page_width = page_rect.width
224
+ page_height = page_rect.height
225
+
226
+ print(f"Page {page_num + 1}: {page_width}x{page_height}, rotation={page.rotation}°")
227
+
228
+ # Render page
229
+ mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
230
+ pix = page.get_pixmap(matrix=mat)
231
+ img_data = pix.tobytes("png")
232
+ full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
233
+ img_width, img_height = full_image.size
234
+
235
+ print(f"Rendered image: {img_width}x{img_height}")
 
 
 
 
 
236
 
237
+ page_results = []
 
238
 
239
+ # Check if page is too wide and should be split
240
+ if split_wide and img_width > MAX_WIDTH:
241
+ print(f"Page is wide ({img_width}px), splitting into chunks...")
 
242
 
243
+ # Calculate proper number of chunks with safer logic
244
+ step_size = MAX_WIDTH - OVERLAP
245
+ if step_size <= 0:
246
+ step_size = MAX_WIDTH // 2 # Fallback
247
 
248
+ num_chunks = max(1, ((img_width - OVERLAP) + step_size - 1) // step_size)
249
 
250
+ print(f"Will create {num_chunks} chunks with step size {step_size}")
 
251
 
252
+ for chunk_idx in range(num_chunks):
253
+ # Calculate chunk boundaries in rendered image pixels
254
+ start_x = chunk_idx * step_size
255
+ end_x = min(start_x + MAX_WIDTH, img_width)
256
+
257
+ # Ensure chunk has valid dimensions
258
+ if end_x <= start_x:
259
+ print(f" Skipping invalid chunk {chunk_idx + 1}: start_x={start_x}, end_x={end_x}")
260
+ continue
261
+
262
+ chunk_actual_width = end_x - start_x
263
+
264
+ # Skip chunks that are too narrow
265
+ if chunk_actual_width < 100:
266
+ print(f" Skipping narrow chunk {chunk_idx + 1}: width={chunk_actual_width}")
267
+ continue
268
+
269
+ print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x} (width={chunk_actual_width})")
270
 
271
+ try:
272
+ # Crop chunk from rendered image
273
+ chunk = full_image.crop((start_x, 0, end_x, img_height))
274
+
275
+ # Verify chunk dimensions
276
+ verify_width, verify_height = chunk.size
277
+ print(f" Chunk actual size: {verify_width}x{verify_height}")
278
+
279
+ # Process chunk (returns coordinates relative to chunk)
280
+ chunk_results = process_image_chunk(chunk)
281
+ print(f" Extracted {len(chunk_results)} items from chunk")
282
+
283
+ # Transform chunk-relative coordinates to full page coordinates
284
+ for result in chunk_results:
285
+ bbox = result['bbox']
286
+
287
+ # Add chunk offset (in rendered image pixels)
288
+ bbox['x'] += start_x
289
+ # y stays the same (no vertical splitting)
290
+
291
+ # Now scale from rendered image pixels to PDF points
292
+ bbox['x'] = bbox['x'] / RENDER_SCALE
293
+ bbox['y'] = bbox['y'] / RENDER_SCALE
294
+ bbox['width'] = bbox['width'] / RENDER_SCALE
295
+ bbox['height'] = bbox['height'] / RENDER_SCALE
296
+
297
+ page_results.extend(chunk_results)
298
+
299
+ except Exception as e:
300
+ print(f" Error processing chunk {chunk_idx + 1}: {e}")
301
+ import traceback
302
+ traceback.print_exc()
303
+ continue
304
 
305
+ print(f" Total extractions from all chunks: {len(page_results)}")
306
+
307
+ else:
308
+ # Process full page (no splitting needed)
309
+ print("Processing full page without splitting")
310
+ chunk_results = process_image_chunk(full_image)
311
+
312
+ # Scale coordinates from rendered image pixels to PDF points
313
+ for result in chunk_results:
314
+ bbox = result['bbox']
315
  bbox['x'] = bbox['x'] / RENDER_SCALE
316
  bbox['y'] = bbox['y'] / RENDER_SCALE
317
  bbox['width'] = bbox['width'] / RENDER_SCALE
318
  bbox['height'] = bbox['height'] / RENDER_SCALE
319
 
320
+ page_results = chunk_results
321
+
322
+ # Remove duplicates from overlapping chunks
323
+ unique_results = []
324
+ seen_boxes = set()
325
 
326
+ DEDUP_TOLERANCE = 5 # pixels tolerance for deduplication
 
 
327
 
328
+ for result in page_results:
 
329
  bbox = result['bbox']
330
+ box_tuple = (
331
+ round(bbox['x'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
332
+ round(bbox['y'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
333
+ round(bbox['width'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
334
+ round(bbox['height'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE
335
+ )
336
+
337
+ if box_tuple not in seen_boxes:
338
+ seen_boxes.add(box_tuple)
339
+ unique_results.append(result)
340
 
341
+ print(f" After deduplication: {len(unique_results)} unique extractions")
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
+ all_results.append({
344
+ "page": page_num + 1,
345
+ "page_dimensions": {
346
+ "width": page_width,
347
+ "height": page_height
348
+ },
349
+ "rotation": page.rotation,
350
+ "extractions": unique_results
351
+ })
352
+
353
+ except Exception as e:
354
+ print(f"Error processing page {page_num + 1}: {e}")
355
+ import traceback
356
+ traceback.print_exc()
357
+ # Add empty page result to maintain page numbering
358
+ all_results.append({
359
+ "page": page_num + 1,
360
+ "page_dimensions": {"width": 0, "height": 0},
361
+ "rotation": 0,
362
+ "extractions": [],
363
+ "error": str(e)
364
+ })
365
 
366
  pdf_document.close()
367
  os.unlink(tmp_file.name)