Alfonso Velasco committed on
Commit
330c438
·
1 Parent(s): 259596e
Files changed (1) hide show
  1. app.py +82 -67
app.py CHANGED
@@ -208,104 +208,92 @@ def should_split_page(rendered_width: int, rendered_height: int,
208
  # For rotated pages (90 or 270), the page has already been rotated in the rendered image
209
  # So we just check the rendered dimensions directly
210
 
211
- aspect_ratio = rendered_width / rendered_height if rendered_height > 0 else 1
212
-
213
- # Don't split if page is portrait or square-ish
214
- if aspect_ratio <= 1.3:
215
- return False, "none"
216
-
217
- # Check if page is too wide
218
  if rendered_width > max_width:
219
- # For very wide pages (like 2-page spreads), split horizontally
220
- if aspect_ratio > 1.8:
221
- return True, "horizontal"
222
- # For moderately wide pages, try to fit
223
- else:
224
- return True, "horizontal"
225
 
226
- return False, "none"
227
 
228
- def split_image_intelligently(image: Image.Image, max_width: int, overlap_ratio: float = 0.15) -> List[Tuple[Image.Image, int]]:
 
229
  """
230
- Split an image into overlapping chunks intelligently.
231
- Returns list of (chunk_image, x_offset) tuples.
 
 
 
232
  """
233
  img_width, img_height = image.size
234
- chunks = []
 
 
235
 
236
  # Calculate overlap in pixels
237
  overlap_pixels = int(max_width * overlap_ratio)
238
-
239
- # Calculate effective step size
240
  step_size = max_width - overlap_pixels
241
 
242
- if step_size <= 0:
243
- step_size = max_width // 2
244
-
245
- # Calculate number of chunks needed
246
- num_chunks = math.ceil((img_width - overlap_pixels) / step_size)
247
-
248
- # If we'd only need 2 chunks and the second would be very small, just use 2 equal chunks
249
- if num_chunks == 2:
250
- second_chunk_width = img_width - step_size
251
- if second_chunk_width < max_width * 0.6: # If second chunk would be less than 60% of max
252
- # Split into two equal chunks with overlap
253
- chunk_width = (img_width + overlap_pixels) // 2
254
- chunks.append((image.crop((0, 0, chunk_width, img_height)), 0))
255
- chunks.append((image.crop((img_width - chunk_width, 0, img_width, img_height)),
256
- img_width - chunk_width))
257
- return chunks
258
 
259
- # Standard overlapping chunks
260
- for i in range(num_chunks):
261
- start_x = i * step_size
262
- end_x = min(start_x + max_width, img_width)
263
 
264
- # Ensure we don't create tiny slivers
265
- if end_x - start_x < max_width * 0.3: # Skip if less than 30% of max width
266
- continue
267
-
268
- chunk = image.crop((start_x, 0, end_x, img_height))
269
- chunks.append((chunk, start_x))
 
270
 
271
- print(f" Chunk {i+1}/{num_chunks}: x={start_x}-{end_x} (width={end_x-start_x})")
 
 
 
 
 
 
 
272
 
273
  return chunks
274
 
275
- def process_pdf(pdf_bytes, split_wide: bool = True):
276
- """Process PDF document, optionally splitting wide pages into chunks"""
 
 
 
 
 
 
277
  all_results = []
278
 
279
- with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
280
  tmp_file.write(pdf_bytes)
281
  tmp_file.flush()
282
 
283
- try:
284
- pdf_document = fitz.open(tmp_file.name)
285
- except Exception as e:
286
- os.unlink(tmp_file.name)
287
- raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
288
-
289
- # Configuration
290
- RENDER_SCALE = 2.0
291
- MAX_WIDTH = 2000 # Increased for better quality
292
- MAX_TOKENS = 768 # Increased token limit for complex documents
293
 
294
  for page_num in range(len(pdf_document)):
295
  try:
296
  page = pdf_document[page_num]
297
- page_rect = page.rect
298
 
299
- # Original page dimensions before any rotation
300
- original_width = page_rect.width
301
- original_height = page_rect.height
 
302
  original_rotation = page.rotation
303
 
304
- print(f"\nPage {page_num + 1}:")
305
  print(f" Original dimensions: {original_width}x{original_height}")
306
  print(f" Rotation: {original_rotation}°")
307
 
308
- # Render page - PyMuPDF automatically applies rotation
 
309
  mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
310
  pix = page.get_pixmap(matrix=mat)
311
  img_data = pix.tobytes("png")
@@ -342,17 +330,33 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
342
  chunk_width, chunk_height = chunk_image.size
343
  print(f" Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
344
 
 
 
 
 
 
345
  # Process chunk with increased token limit
346
  chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
347
- print(f" Extracted {len(chunk_results)} items")
 
 
 
 
 
 
348
 
349
  # Transform chunk-relative coordinates to full page coordinates
350
  for result in chunk_results:
351
  bbox = result['bbox']
352
 
353
  # Add chunk offset (in rendered image pixels)
 
354
  bbox['x'] += x_offset
355
 
 
 
 
 
356
  # Scale from rendered image pixels to PDF points
357
  # Use effective dimensions for proper scaling
358
  bbox['x'] = bbox['x'] / RENDER_SCALE
@@ -362,6 +366,8 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
362
 
363
  page_results.extend(chunk_results)
364
 
 
 
365
  else:
366
  # Process full page without splitting
367
  print(" Processing full page without splitting...")
@@ -382,6 +388,11 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
382
  unique_results = deduplicate_results(page_results)
383
  print(f" After deduplication: {len(unique_results)} unique items")
384
 
 
 
 
 
 
385
  # Return results with both original and effective dimensions
386
  all_results.append({
387
  "page": page_num + 1,
@@ -394,6 +405,10 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
394
  "height": effective_pdf_height
395
  },
396
  "rotation": original_rotation,
 
 
 
 
397
  "extractions": unique_results
398
  })
399
 
 
208
  # For rotated pages (90 or 270), the page has already been rotated in the rendered image
209
  # So we just check the rendered dimensions directly
210
 
211
+ # Check if width exceeds max_width
 
 
 
 
 
 
212
  if rendered_width > max_width:
213
+ return (True, "horizontal")
214
+
215
+ # Could add vertical splitting logic here if needed
216
+ # if rendered_height > max_height:
217
+ # return (True, "vertical")
 
218
 
219
+ return (False, None)
220
 
221
+ def split_image_intelligently(image: Image.Image, max_width: int,
222
+ overlap_ratio: float = 0.1) -> List[Tuple[Image.Image, int]]:
223
  """
224
+ Split image into overlapping chunks.
225
+
226
+ Returns:
227
+ List of (chunk_image, x_offset) tuples where x_offset is the pixel position
228
+ in the original image where this chunk starts.
229
  """
230
  img_width, img_height = image.size
231
+
232
+ if img_width <= max_width:
233
+ return [(image, 0)]
234
 
235
  # Calculate overlap in pixels
236
  overlap_pixels = int(max_width * overlap_ratio)
 
 
237
  step_size = max_width - overlap_pixels
238
 
239
+ chunks = []
240
+ x_position = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
+ while x_position < img_width:
243
+ # Calculate the right edge of this chunk
244
+ right_edge = min(x_position + max_width, img_width)
 
245
 
246
+ # If this would be a very small last chunk, extend the previous chunk instead
247
+ if right_edge < img_width and (img_width - right_edge) < (max_width * 0.3):
248
+ right_edge = img_width
249
+
250
+ # Crop the chunk
251
+ chunk = image.crop((x_position, 0, right_edge, img_height))
252
+ chunks.append((chunk, x_position))
253
 
254
+ print(f" Created chunk at x={x_position}, width={right_edge - x_position}")
255
+
256
+ # If we've reached the end, break
257
+ if right_edge >= img_width:
258
+ break
259
+
260
+ # Move to next chunk position
261
+ x_position += step_size
262
 
263
  return chunks
264
 
265
+ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
266
+ """
267
+ Process PDF and extract structured content.
268
+ """
269
+ RENDER_SCALE = 3.0 # High resolution for better OCR
270
+ MAX_WIDTH = 2000 # Maximum width for a single chunk (in rendered pixels)
271
+ MAX_TOKENS = 768 # Increased from 512 for better coverage
272
+
273
  all_results = []
274
 
275
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
276
  tmp_file.write(pdf_bytes)
277
  tmp_file.flush()
278
 
279
+ pdf_document = fitz.open(tmp_file.name)
 
 
 
 
 
 
 
 
 
280
 
281
  for page_num in range(len(pdf_document)):
282
  try:
283
  page = pdf_document[page_num]
 
284
 
285
+ # Get original page dimensions and rotation
286
+ original_rect = page.rect
287
+ original_width = original_rect.width
288
+ original_height = original_rect.height
289
  original_rotation = page.rotation
290
 
291
+ print(f"\nProcessing page {page_num + 1}:")
292
  print(f" Original dimensions: {original_width}x{original_height}")
293
  print(f" Rotation: {original_rotation}°")
294
 
295
+ # Render page at high resolution
296
+ # PyMuPDF automatically handles rotation when rendering
297
  mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
298
  pix = page.get_pixmap(matrix=mat)
299
  img_data = pix.tobytes("png")
 
330
  chunk_width, chunk_height = chunk_image.size
331
  print(f" Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
332
 
333
+ # IMPORTANT: Save chunk for debugging
334
+ if chunk_idx == 0:
335
+ print(f" DEBUG: Saving first chunk for inspection")
336
+ # chunk_image.save(f"/tmp/debug_chunk_{page_num}_{chunk_idx}.png")
337
+
338
  # Process chunk with increased token limit
339
  chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
340
+ print(f" Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
341
+
342
+ # DEBUG: Print sample of first few items
343
+ if chunk_results:
344
+ print(f" Sample items from chunk {chunk_idx + 1}:")
345
+ for i, item in enumerate(chunk_results[:3]):
346
+ print(f" Item {i+1}: text='{item['text']}', x={item['bbox']['x']:.1f}")
347
 
348
  # Transform chunk-relative coordinates to full page coordinates
349
  for result in chunk_results:
350
  bbox = result['bbox']
351
 
352
  # Add chunk offset (in rendered image pixels)
353
+ original_chunk_x = bbox['x']
354
  bbox['x'] += x_offset
355
 
356
+ # DEBUG: Print transformation for first item in each chunk
357
+ if result == chunk_results[0]:
358
+ print(f" Coordinate transform: chunk_x={original_chunk_x:.1f} + offset={x_offset} = page_x={bbox['x']:.1f}")
359
+
360
  # Scale from rendered image pixels to PDF points
361
  # Use effective dimensions for proper scaling
362
  bbox['x'] = bbox['x'] / RENDER_SCALE
 
366
 
367
  page_results.extend(chunk_results)
368
 
369
+ print(f" Total items before deduplication: {len(page_results)}")
370
+
371
  else:
372
  # Process full page without splitting
373
  print(" Processing full page without splitting...")
 
388
  unique_results = deduplicate_results(page_results)
389
  print(f" After deduplication: {len(unique_results)} unique items")
390
 
391
+ # DEBUG: Print x-coordinate range of results
392
+ if unique_results:
393
+ x_coords = [item['bbox']['x'] for item in unique_results]
394
+ print(f" X-coordinate range: {min(x_coords):.1f} to {max(x_coords):.1f}")
395
+
396
  # Return results with both original and effective dimensions
397
  all_results.append({
398
  "page": page_num + 1,
 
405
  "height": effective_pdf_height
406
  },
407
  "rotation": original_rotation,
408
+ "rendered_dimensions": {
409
+ "width": rendered_width,
410
+ "height": rendered_height
411
+ },
412
  "extractions": unique_results
413
  })
414