Alfonso Velasco committed on
Commit
2ddaa4e
·
1 Parent(s): 1af4bc8

fix coordinate scaling error

Browse files
Files changed (1) hide show
  1. app.py +33 -34
app.py CHANGED
@@ -43,7 +43,7 @@ except Exception as e:
43
  class DocumentRequest(BaseModel):
44
  pdf: str = None
45
  image: str = None
46
- split_wide_pages: bool = True # New option to split wide pages
47
 
48
  @app.get("/")
49
  def home():
@@ -66,10 +66,10 @@ async def extract_document(request: DocumentRequest):
66
  except Exception as e:
67
  raise HTTPException(status_code=500, detail=str(e))
68
 
69
- def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float = 0) -> List[Dict]:
70
  """
71
- Process a single image or image chunk and return extractions.
72
- offset_x and offset_y are used when processing chunks of a larger image.
73
  """
74
  img_width, img_height = image.size
75
 
@@ -78,7 +78,7 @@ def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float
78
  image,
79
  truncation=True,
80
  padding="max_length",
81
- max_length=1024, # Increased limit
82
  return_tensors="pt"
83
  )
84
  except Exception as e:
@@ -114,18 +114,12 @@ def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float
114
  if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
115
  continue
116
 
117
- # Convert to chunk coordinates
118
  x = (x_norm / 1000.0) * img_width
119
  y = (y_norm / 1000.0) * img_height
120
  x2 = (x2_norm / 1000.0) * img_width
121
  y2 = (y2_norm / 1000.0) * img_height
122
 
123
- # Add offset to get coordinates in full page space
124
- x += offset_x
125
- y += offset_y
126
- x2 += offset_x
127
- y2 += offset_y
128
-
129
  width = x2 - x
130
  height = y2 - y
131
 
@@ -192,46 +186,48 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
192
  chunk_width = MAX_WIDTH
193
 
194
  for chunk_idx in range(num_chunks):
195
- # Calculate chunk boundaries
196
  start_x = chunk_idx * (chunk_width - OVERLAP)
197
  end_x = min(start_x + chunk_width, img_width)
198
 
199
- # Crop chunk
200
  chunk = full_image.crop((start_x, 0, end_x, img_height))
201
 
202
  print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")
203
 
204
- # Process chunk and adjust coordinates
205
- chunk_offset_pdf = start_x / RENDER_SCALE
206
- chunk_results = process_image_chunk(
207
- chunk,
208
- offset_x=chunk_offset_pdf,
209
- offset_y=0
210
- )
211
 
212
- # Scale coordinates back to PDF space
213
  for result in chunk_results:
214
  bbox = result['bbox']
215
- bbox['x'] /= RENDER_SCALE
216
- bbox['y'] /= RENDER_SCALE
217
- bbox['width'] /= RENDER_SCALE
218
- bbox['height'] /= RENDER_SCALE
 
 
 
 
 
 
 
219
 
220
  page_results.extend(chunk_results)
221
 
222
  print(f" Total extractions from all chunks: {len(page_results)}")
223
 
224
  else:
225
- # Process full page
226
- chunk_results = process_image_chunk(full_image, 0, 0)
227
 
228
- # Scale coordinates back to PDF space
229
  for result in chunk_results:
230
  bbox = result['bbox']
231
- bbox['x'] = (bbox['x'] / img_width) * page_width
232
- bbox['y'] = (bbox['y'] / img_height) * page_height
233
- bbox['width'] = (bbox['width'] / img_width) * page_width
234
- bbox['height'] = (bbox['height'] / img_height) * page_height
235
 
236
  page_results = chunk_results
237
 
@@ -278,7 +274,10 @@ def process_image(image_bytes):
278
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
279
  img_width, img_height = image.size
280
 
281
- results = process_image_chunk(image, 0, 0)
 
 
 
282
 
283
  return {
284
  "document_type": "image",
 
43
  class DocumentRequest(BaseModel):
44
  pdf: str = None
45
  image: str = None
46
+ split_wide_pages: bool = True
47
 
48
  @app.get("/")
49
  def home():
 
66
  except Exception as e:
67
  raise HTTPException(status_code=500, detail=str(e))
68
 
69
+ def process_image_chunk(image: Image.Image) -> List[Dict]:
70
  """
71
+ Process a single image or image chunk and return extractions with coordinates
72
+ relative to the chunk (0,0 at top-left of chunk).
73
  """
74
  img_width, img_height = image.size
75
 
 
78
  image,
79
  truncation=True,
80
  padding="max_length",
81
+ max_length=1024,
82
  return_tensors="pt"
83
  )
84
  except Exception as e:
 
114
  if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
115
  continue
116
 
117
+ # Convert normalized coordinates to chunk pixel coordinates
118
  x = (x_norm / 1000.0) * img_width
119
  y = (y_norm / 1000.0) * img_height
120
  x2 = (x2_norm / 1000.0) * img_width
121
  y2 = (y2_norm / 1000.0) * img_height
122
 
 
 
 
 
 
 
123
  width = x2 - x
124
  height = y2 - y
125
 
 
186
  chunk_width = MAX_WIDTH
187
 
188
  for chunk_idx in range(num_chunks):
189
+ # Calculate chunk boundaries in rendered image pixels
190
  start_x = chunk_idx * (chunk_width - OVERLAP)
191
  end_x = min(start_x + chunk_width, img_width)
192
 
193
+ # Crop chunk from rendered image
194
  chunk = full_image.crop((start_x, 0, end_x, img_height))
195
 
196
  print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")
197
 
198
+ # Process chunk (returns coordinates relative to chunk)
199
+ chunk_results = process_image_chunk(chunk)
 
 
 
 
 
200
 
201
+ # Transform chunk-relative coordinates to full page coordinates
202
  for result in chunk_results:
203
  bbox = result['bbox']
204
+
205
+ # Add chunk offset (in rendered image pixels)
206
+ bbox['x'] += start_x
207
+ # y stays the same (no vertical splitting)
208
+ # bbox['y'] is already correct
209
+
210
+ # Now scale from rendered image pixels to PDF points
211
+ bbox['x'] = bbox['x'] / RENDER_SCALE
212
+ bbox['y'] = bbox['y'] / RENDER_SCALE
213
+ bbox['width'] = bbox['width'] / RENDER_SCALE
214
+ bbox['height'] = bbox['height'] / RENDER_SCALE
215
 
216
  page_results.extend(chunk_results)
217
 
218
  print(f" Total extractions from all chunks: {len(page_results)}")
219
 
220
  else:
221
+ # Process full page (no splitting needed)
222
+ chunk_results = process_image_chunk(full_image)
223
 
224
+ # Scale coordinates from rendered image pixels to PDF points
225
  for result in chunk_results:
226
  bbox = result['bbox']
227
+ bbox['x'] = bbox['x'] / RENDER_SCALE
228
+ bbox['y'] = bbox['y'] / RENDER_SCALE
229
+ bbox['width'] = bbox['width'] / RENDER_SCALE
230
+ bbox['height'] = bbox['height'] / RENDER_SCALE
231
 
232
  page_results = chunk_results
233
 
 
274
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
275
  img_width, img_height = image.size
276
 
277
+ # Process the image
278
+ results = process_image_chunk(image)
279
+
280
+ # Coordinates are already in image pixels, no scaling needed for standalone images
281
 
282
  return {
283
  "document_type": "image",