Spaces:
Sleeping
Sleeping
Alfonso Velasco committed on
Commit ·
10a2064
1
Parent(s): 50304f8
fix chunk
Browse files
app.py
CHANGED
|
@@ -78,58 +78,78 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
|
|
| 78 |
print(f"Invalid image dimensions: {img_width}x{img_height}")
|
| 79 |
return []
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
padding="max_length",
|
| 86 |
-
max_length=max_tokens,
|
| 87 |
-
return_tensors="pt"
|
| 88 |
-
)
|
| 89 |
-
except Exception as e:
|
| 90 |
-
print(f"OCR failed: {e}, using fallback")
|
| 91 |
try:
|
| 92 |
encoding = processor(
|
| 93 |
image,
|
| 94 |
-
text=[""] * max_tokens,
|
| 95 |
-
boxes=[[0, 0, 0, 0]] * max_tokens,
|
| 96 |
truncation=True,
|
| 97 |
padding="max_length",
|
| 98 |
-
max_length=
|
| 99 |
return_tensors="pt"
|
| 100 |
)
|
| 101 |
-
except Exception as
|
| 102 |
-
print(f"
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
with torch.no_grad():
|
| 123 |
outputs = model(**encoding)
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
try:
|
| 135 |
tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
|
|
@@ -230,8 +250,8 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
|
| 230 |
actual render, then transform them to the effective coordinate space.
|
| 231 |
"""
|
| 232 |
RENDER_SCALE = 3.0
|
| 233 |
-
MAX_WIDTH =
|
| 234 |
-
MAX_TOKENS =
|
| 235 |
|
| 236 |
all_results = []
|
| 237 |
|
|
|
|
| 78 |
print(f"Invalid image dimensions: {img_width}x{img_height}")
|
| 79 |
return []
|
| 80 |
|
| 81 |
+
# Try multiple token limits if we hit errors
|
| 82 |
+
token_limits = [max_tokens, 384, 256] if max_tokens > 256 else [max_tokens]
|
| 83 |
+
|
| 84 |
+
for token_limit in token_limits:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
try:
|
| 86 |
encoding = processor(
|
| 87 |
image,
|
|
|
|
|
|
|
| 88 |
truncation=True,
|
| 89 |
padding="max_length",
|
| 90 |
+
max_length=token_limit,
|
| 91 |
return_tensors="pt"
|
| 92 |
)
|
| 93 |
+
except Exception as e:
|
| 94 |
+
print(f"OCR failed with max_tokens={token_limit}: {e}")
|
| 95 |
+
if token_limit == token_limits[-1]:
|
| 96 |
+
# Last attempt, try fallback
|
| 97 |
+
try:
|
| 98 |
+
encoding = processor(
|
| 99 |
+
image,
|
| 100 |
+
text=[""] * token_limit,
|
| 101 |
+
boxes=[[0, 0, 0, 0]] * token_limit,
|
| 102 |
+
truncation=True,
|
| 103 |
+
padding="max_length",
|
| 104 |
+
max_length=token_limit,
|
| 105 |
+
return_tensors="pt"
|
| 106 |
+
)
|
| 107 |
+
except Exception as e2:
|
| 108 |
+
print(f"Fallback also failed: {e2}")
|
| 109 |
+
return []
|
| 110 |
+
else:
|
| 111 |
+
continue
|
| 112 |
+
|
| 113 |
+
encoding_device = {}
|
| 114 |
+
for k, v in encoding.items():
|
| 115 |
+
if isinstance(v, torch.Tensor):
|
| 116 |
+
encoding_device[k] = v.to(device)
|
| 117 |
+
if k == "bbox":
|
| 118 |
+
encoding_device[k] = torch.clamp(encoding_device[k], 0, 1000)
|
| 119 |
+
|
| 120 |
+
encoding = encoding_device
|
| 121 |
+
|
| 122 |
+
try:
|
| 123 |
with torch.no_grad():
|
| 124 |
outputs = model(**encoding)
|
| 125 |
+
# Success! Break out of retry loop
|
| 126 |
+
break
|
| 127 |
+
except RuntimeError as e:
|
| 128 |
+
error_str = str(e)
|
| 129 |
+
if "CUDA" in error_str:
|
| 130 |
+
print(f"CUDA error encountered: {e}")
|
| 131 |
+
encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
|
| 132 |
+
model.cpu()
|
| 133 |
+
with torch.no_grad():
|
| 134 |
+
outputs = model(**encoding)
|
| 135 |
+
model.to(device)
|
| 136 |
+
break
|
| 137 |
+
elif "index out of range" in error_str:
|
| 138 |
+
print(f"Index error with max_tokens={token_limit}: {e}")
|
| 139 |
+
if token_limit == token_limits[-1]:
|
| 140 |
+
print(f"All token limits exhausted, returning empty results")
|
| 141 |
+
return []
|
| 142 |
+
else:
|
| 143 |
+
print(f"Retrying with smaller token limit...")
|
| 144 |
+
continue
|
| 145 |
+
else:
|
| 146 |
+
raise
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"Unexpected error in model processing: {e}")
|
| 149 |
+
if token_limit == token_limits[-1]:
|
| 150 |
+
return []
|
| 151 |
+
else:
|
| 152 |
+
continue
|
| 153 |
|
| 154 |
try:
|
| 155 |
tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
|
|
|
|
| 250 |
actual render, then transform them to the effective coordinate space.
|
| 251 |
"""
|
| 252 |
RENDER_SCALE = 3.0
|
| 253 |
+
MAX_WIDTH = 1800 # Maximum width for a chunk in rendered pixels (reduced to ensure splitting)
|
| 254 |
+
MAX_TOKENS = 512 # Reduced to prevent index out of range errors with large images
|
| 255 |
|
| 256 |
all_results = []
|
| 257 |
|