Spaces:
Running
on
T4
Running
on
T4
Alfonso Velasco
committed on
Commit
·
330c438
1
Parent(s):
259596e
fix chunk
Browse files
app.py
CHANGED
|
@@ -208,104 +208,92 @@ def should_split_page(rendered_width: int, rendered_height: int,
|
|
| 208 |
# For rotated pages (90 or 270), the page has already been rotated in the rendered image
|
| 209 |
# So we just check the rendered dimensions directly
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
# Don't split if page is portrait or square-ish
|
| 214 |
-
if aspect_ratio <= 1.3:
|
| 215 |
-
return False, "none"
|
| 216 |
-
|
| 217 |
-
# Check if page is too wide
|
| 218 |
if rendered_width > max_width:
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
return True, "horizontal"
|
| 225 |
|
| 226 |
-
return False,
|
| 227 |
|
| 228 |
-
def split_image_intelligently(image: Image.Image, max_width: int,
|
|
|
|
| 229 |
"""
|
| 230 |
-
Split
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
| 232 |
"""
|
| 233 |
img_width, img_height = image.size
|
| 234 |
-
|
|
|
|
|
|
|
| 235 |
|
| 236 |
# Calculate overlap in pixels
|
| 237 |
overlap_pixels = int(max_width * overlap_ratio)
|
| 238 |
-
|
| 239 |
-
# Calculate effective step size
|
| 240 |
step_size = max_width - overlap_pixels
|
| 241 |
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
# Calculate number of chunks needed
|
| 246 |
-
num_chunks = math.ceil((img_width - overlap_pixels) / step_size)
|
| 247 |
-
|
| 248 |
-
# If we'd only need 2 chunks and the second would be very small, just use 2 equal chunks
|
| 249 |
-
if num_chunks == 2:
|
| 250 |
-
second_chunk_width = img_width - step_size
|
| 251 |
-
if second_chunk_width < max_width * 0.6: # If second chunk would be less than 60% of max
|
| 252 |
-
# Split into two equal chunks with overlap
|
| 253 |
-
chunk_width = (img_width + overlap_pixels) // 2
|
| 254 |
-
chunks.append((image.crop((0, 0, chunk_width, img_height)), 0))
|
| 255 |
-
chunks.append((image.crop((img_width - chunk_width, 0, img_width, img_height)),
|
| 256 |
-
img_width - chunk_width))
|
| 257 |
-
return chunks
|
| 258 |
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
end_x = min(start_x + max_width, img_width)
|
| 263 |
|
| 264 |
-
#
|
| 265 |
-
if
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
|
|
|
| 270 |
|
| 271 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
return chunks
|
| 274 |
|
| 275 |
-
def process_pdf(pdf_bytes, split_wide: bool = True):
|
| 276 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
all_results = []
|
| 278 |
|
| 279 |
-
with tempfile.NamedTemporaryFile(suffix='.pdf'
|
| 280 |
tmp_file.write(pdf_bytes)
|
| 281 |
tmp_file.flush()
|
| 282 |
|
| 283 |
-
|
| 284 |
-
pdf_document = fitz.open(tmp_file.name)
|
| 285 |
-
except Exception as e:
|
| 286 |
-
os.unlink(tmp_file.name)
|
| 287 |
-
raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
|
| 288 |
-
|
| 289 |
-
# Configuration
|
| 290 |
-
RENDER_SCALE = 2.0
|
| 291 |
-
MAX_WIDTH = 2000 # Increased for better quality
|
| 292 |
-
MAX_TOKENS = 768 # Increased token limit for complex documents
|
| 293 |
|
| 294 |
for page_num in range(len(pdf_document)):
|
| 295 |
try:
|
| 296 |
page = pdf_document[page_num]
|
| 297 |
-
page_rect = page.rect
|
| 298 |
|
| 299 |
-
#
|
| 300 |
-
|
| 301 |
-
|
|
|
|
| 302 |
original_rotation = page.rotation
|
| 303 |
|
| 304 |
-
print(f"\
|
| 305 |
print(f" Original dimensions: {original_width}x{original_height}")
|
| 306 |
print(f" Rotation: {original_rotation}°")
|
| 307 |
|
| 308 |
-
# Render page
|
|
|
|
| 309 |
mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
|
| 310 |
pix = page.get_pixmap(matrix=mat)
|
| 311 |
img_data = pix.tobytes("png")
|
|
@@ -342,17 +330,33 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 342 |
chunk_width, chunk_height = chunk_image.size
|
| 343 |
print(f" Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
|
| 344 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
# Process chunk with increased token limit
|
| 346 |
chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
|
| 347 |
-
print(f" Extracted {len(chunk_results)} items")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
|
| 349 |
# Transform chunk-relative coordinates to full page coordinates
|
| 350 |
for result in chunk_results:
|
| 351 |
bbox = result['bbox']
|
| 352 |
|
| 353 |
# Add chunk offset (in rendered image pixels)
|
|
|
|
| 354 |
bbox['x'] += x_offset
|
| 355 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
# Scale from rendered image pixels to PDF points
|
| 357 |
# Use effective dimensions for proper scaling
|
| 358 |
bbox['x'] = bbox['x'] / RENDER_SCALE
|
|
@@ -362,6 +366,8 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 362 |
|
| 363 |
page_results.extend(chunk_results)
|
| 364 |
|
|
|
|
|
|
|
| 365 |
else:
|
| 366 |
# Process full page without splitting
|
| 367 |
print(" Processing full page without splitting...")
|
|
@@ -382,6 +388,11 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 382 |
unique_results = deduplicate_results(page_results)
|
| 383 |
print(f" After deduplication: {len(unique_results)} unique items")
|
| 384 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
# Return results with both original and effective dimensions
|
| 386 |
all_results.append({
|
| 387 |
"page": page_num + 1,
|
|
@@ -394,6 +405,10 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 394 |
"height": effective_pdf_height
|
| 395 |
},
|
| 396 |
"rotation": original_rotation,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
"extractions": unique_results
|
| 398 |
})
|
| 399 |
|
|
|
|
| 208 |
# For rotated pages (90 or 270), the page has already been rotated in the rendered image
|
| 209 |
# So we just check the rendered dimensions directly
|
| 210 |
|
| 211 |
+
# Check if width exceeds max_width
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
if rendered_width > max_width:
|
| 213 |
+
return (True, "horizontal")
|
| 214 |
+
|
| 215 |
+
# Could add vertical splitting logic here if needed
|
| 216 |
+
# if rendered_height > max_height:
|
| 217 |
+
# return (True, "vertical")
|
|
|
|
| 218 |
|
| 219 |
+
return (False, None)
|
| 220 |
|
| 221 |
+
def split_image_intelligently(image: "Image.Image", max_width: int,
                              overlap_ratio: float = 0.1) -> List[Tuple["Image.Image", int]]:
    """
    Split an image into horizontally overlapping, full-height chunks.

    Chunks are cut left to right. Each chunk starts ``step_size`` pixels after
    the previous one, where ``step_size = max_width - int(max_width * overlap_ratio)``,
    so neighbouring chunks share ``int(max_width * overlap_ratio)`` pixel columns.

    If cutting a chunk at ``max_width`` would leave a remainder narrower than
    30% of ``max_width``, that remainder is absorbed into the current chunk.
    The final chunk may therefore be up to 30% wider than ``max_width``.

    Args:
        image: Source image; only ``.size`` and ``.crop()`` are used.
        max_width: Target chunk width in pixels. Must be positive whenever the
            image is wider than ``max_width``.
        overlap_ratio: Fraction of ``max_width`` shared by consecutive chunks.
            Must satisfy ``0 <= overlap_ratio < 1`` whenever the image actually
            has to be split.

    Returns:
        List of (chunk_image, x_offset) tuples where x_offset is the pixel position
        in the original image where this chunk starts. An image that already
        fits within ``max_width`` is returned as-is as ``[(image, 0)]``.

    Raises:
        ValueError: If the image needs splitting and ``max_width`` is not
            positive or ``overlap_ratio`` is outside ``[0, 1)``.
    """
    img_width, img_height = image.size

    # Nothing to split: hand back the original image object untouched.
    if img_width <= max_width:
        return [(image, 0)]

    # A non-positive step would keep x_position from ever advancing (endless
    # loop), and a negative overlap would make the step larger than the chunk
    # width and skip pixel columns between chunks. Reject both up front.
    if max_width <= 0:
        raise ValueError(f"max_width must be positive, got {max_width}")
    if not 0 <= overlap_ratio < 1:
        raise ValueError(f"overlap_ratio must be in [0, 1), got {overlap_ratio}")

    # Calculate overlap in pixels
    overlap_pixels = int(max_width * overlap_ratio)
    step_size = max_width - overlap_pixels

    # Remainders narrower than this are merged into the current chunk.
    min_tail_width = max_width * 0.3

    chunks = []
    x_position = 0

    while x_position < img_width:
        # Calculate the right edge of this chunk
        right_edge = min(x_position + max_width, img_width)

        # If only a thin sliver would remain to the right, widen THIS chunk to
        # the image edge rather than emitting a tiny trailing chunk.
        if right_edge < img_width and (img_width - right_edge) < min_tail_width:
            right_edge = img_width

        # Crop the chunk (full image height)
        chunk = image.crop((x_position, 0, right_edge, img_height))
        chunks.append((chunk, x_position))

        print(f" Created chunk at x={x_position}, width={right_edge - x_position}")

        # If we've reached the end, break
        if right_edge >= img_width:
            break

        # Move to next chunk position
        x_position += step_size

    return chunks
|
| 264 |
|
| 265 |
+
def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
| 266 |
+
"""
|
| 267 |
+
Process PDF and extract structured content.
|
| 268 |
+
"""
|
| 269 |
+
RENDER_SCALE = 3.0 # High resolution for better OCR
|
| 270 |
+
MAX_WIDTH = 2000 # Maximum width for a single chunk (in rendered pixels)
|
| 271 |
+
MAX_TOKENS = 768 # Increased from 512 for better coverage
|
| 272 |
+
|
| 273 |
all_results = []
|
| 274 |
|
| 275 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
| 276 |
tmp_file.write(pdf_bytes)
|
| 277 |
tmp_file.flush()
|
| 278 |
|
| 279 |
+
pdf_document = fitz.open(tmp_file.name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
for page_num in range(len(pdf_document)):
|
| 282 |
try:
|
| 283 |
page = pdf_document[page_num]
|
|
|
|
| 284 |
|
| 285 |
+
# Get original page dimensions and rotation
|
| 286 |
+
original_rect = page.rect
|
| 287 |
+
original_width = original_rect.width
|
| 288 |
+
original_height = original_rect.height
|
| 289 |
original_rotation = page.rotation
|
| 290 |
|
| 291 |
+
print(f"\nProcessing page {page_num + 1}:")
|
| 292 |
print(f" Original dimensions: {original_width}x{original_height}")
|
| 293 |
print(f" Rotation: {original_rotation}°")
|
| 294 |
|
| 295 |
+
# Render page at high resolution
|
| 296 |
+
# PyMuPDF automatically handles rotation when rendering
|
| 297 |
mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
|
| 298 |
pix = page.get_pixmap(matrix=mat)
|
| 299 |
img_data = pix.tobytes("png")
|
|
|
|
| 330 |
chunk_width, chunk_height = chunk_image.size
|
| 331 |
print(f" Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
|
| 332 |
|
| 333 |
+
# IMPORTANT: Save chunk for debugging
|
| 334 |
+
if chunk_idx == 0:
|
| 335 |
+
print(f" DEBUG: Saving first chunk for inspection")
|
| 336 |
+
# chunk_image.save(f"/tmp/debug_chunk_{page_num}_{chunk_idx}.png")
|
| 337 |
+
|
| 338 |
# Process chunk with increased token limit
|
| 339 |
chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
|
| 340 |
+
print(f" Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
|
| 341 |
+
|
| 342 |
+
# DEBUG: Print sample of first few items
|
| 343 |
+
if chunk_results:
|
| 344 |
+
print(f" Sample items from chunk {chunk_idx + 1}:")
|
| 345 |
+
for i, item in enumerate(chunk_results[:3]):
|
| 346 |
+
print(f" Item {i+1}: text='{item['text']}', x={item['bbox']['x']:.1f}")
|
| 347 |
|
| 348 |
# Transform chunk-relative coordinates to full page coordinates
|
| 349 |
for result in chunk_results:
|
| 350 |
bbox = result['bbox']
|
| 351 |
|
| 352 |
# Add chunk offset (in rendered image pixels)
|
| 353 |
+
original_chunk_x = bbox['x']
|
| 354 |
bbox['x'] += x_offset
|
| 355 |
|
| 356 |
+
# DEBUG: Print transformation for first item in each chunk
|
| 357 |
+
if result == chunk_results[0]:
|
| 358 |
+
print(f" Coordinate transform: chunk_x={original_chunk_x:.1f} + offset={x_offset} = page_x={bbox['x']:.1f}")
|
| 359 |
+
|
| 360 |
# Scale from rendered image pixels to PDF points
|
| 361 |
# Use effective dimensions for proper scaling
|
| 362 |
bbox['x'] = bbox['x'] / RENDER_SCALE
|
|
|
|
| 366 |
|
| 367 |
page_results.extend(chunk_results)
|
| 368 |
|
| 369 |
+
print(f" Total items before deduplication: {len(page_results)}")
|
| 370 |
+
|
| 371 |
else:
|
| 372 |
# Process full page without splitting
|
| 373 |
print(" Processing full page without splitting...")
|
|
|
|
| 388 |
unique_results = deduplicate_results(page_results)
|
| 389 |
print(f" After deduplication: {len(unique_results)} unique items")
|
| 390 |
|
| 391 |
+
# DEBUG: Print x-coordinate range of results
|
| 392 |
+
if unique_results:
|
| 393 |
+
x_coords = [item['bbox']['x'] for item in unique_results]
|
| 394 |
+
print(f" X-coordinate range: {min(x_coords):.1f} to {max(x_coords):.1f}")
|
| 395 |
+
|
| 396 |
# Return results with both original and effective dimensions
|
| 397 |
all_results.append({
|
| 398 |
"page": page_num + 1,
|
|
|
|
| 405 |
"height": effective_pdf_height
|
| 406 |
},
|
| 407 |
"rotation": original_rotation,
|
| 408 |
+
"rendered_dimensions": {
|
| 409 |
+
"width": rendered_width,
|
| 410 |
+
"height": rendered_height
|
| 411 |
+
},
|
| 412 |
"extractions": unique_results
|
| 413 |
})
|
| 414 |
|