Alfonso Velasco committed on
Commit
10a2064
·
1 Parent(s): 50304f8

fix chunk

Browse files
Files changed (1) hide show
  1. app.py +65 -45
app.py CHANGED
@@ -78,58 +78,78 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
78
  print(f"Invalid image dimensions: {img_width}x{img_height}")
79
  return []
80
 
81
- try:
82
- encoding = processor(
83
- image,
84
- truncation=True,
85
- padding="max_length",
86
- max_length=max_tokens,
87
- return_tensors="pt"
88
- )
89
- except Exception as e:
90
- print(f"OCR failed: {e}, using fallback")
91
  try:
92
  encoding = processor(
93
  image,
94
- text=[""] * max_tokens,
95
- boxes=[[0, 0, 0, 0]] * max_tokens,
96
  truncation=True,
97
  padding="max_length",
98
- max_length=max_tokens,
99
  return_tensors="pt"
100
  )
101
- except Exception as e2:
102
- print(f"Fallback also failed: {e2}")
103
- return []
104
-
105
- encoding_device = {}
106
- for k, v in encoding.items():
107
- if isinstance(v, torch.Tensor):
108
- encoding_device[k] = v.to(device)
109
- if k == "bbox":
110
- encoding_device[k] = torch.clamp(encoding_device[k], 0, 1000)
111
-
112
- encoding = encoding_device
113
-
114
- try:
115
- with torch.no_grad():
116
- outputs = model(**encoding)
117
- except RuntimeError as e:
118
- if "CUDA" in str(e):
119
- print(f"CUDA error encountered: {e}")
120
- encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
121
- model.cpu()
 
 
 
 
 
 
 
 
 
122
  with torch.no_grad():
123
  outputs = model(**encoding)
124
- model.to(device)
125
- elif "index out of range" in str(e):
126
- print(f"Index error in model processing: {e}")
127
- return []
128
- else:
129
- raise
130
- except Exception as e:
131
- print(f"Unexpected error in model processing: {e}")
132
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  try:
135
  tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
@@ -230,8 +250,8 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
230
  actual render, then transform them to the effective coordinate space.
231
  """
232
  RENDER_SCALE = 3.0
233
- MAX_WIDTH = 2000 # Maximum width for a chunk in rendered pixels
234
- MAX_TOKENS = 768
235
 
236
  all_results = []
237
 
 
78
  print(f"Invalid image dimensions: {img_width}x{img_height}")
79
  return []
80
 
81
+ # Try multiple token limits if we hit errors
82
+ token_limits = [max_tokens, 384, 256] if max_tokens > 256 else [max_tokens]
83
+
84
+ for token_limit in token_limits:
 
 
 
 
 
 
85
  try:
86
  encoding = processor(
87
  image,
 
 
88
  truncation=True,
89
  padding="max_length",
90
+ max_length=token_limit,
91
  return_tensors="pt"
92
  )
93
+ except Exception as e:
94
+ print(f"OCR failed with max_tokens={token_limit}: {e}")
95
+ if token_limit == token_limits[-1]:
96
+ # Last attempt, try fallback
97
+ try:
98
+ encoding = processor(
99
+ image,
100
+ text=[""] * token_limit,
101
+ boxes=[[0, 0, 0, 0]] * token_limit,
102
+ truncation=True,
103
+ padding="max_length",
104
+ max_length=token_limit,
105
+ return_tensors="pt"
106
+ )
107
+ except Exception as e2:
108
+ print(f"Fallback also failed: {e2}")
109
+ return []
110
+ else:
111
+ continue
112
+
113
+ encoding_device = {}
114
+ for k, v in encoding.items():
115
+ if isinstance(v, torch.Tensor):
116
+ encoding_device[k] = v.to(device)
117
+ if k == "bbox":
118
+ encoding_device[k] = torch.clamp(encoding_device[k], 0, 1000)
119
+
120
+ encoding = encoding_device
121
+
122
+ try:
123
  with torch.no_grad():
124
  outputs = model(**encoding)
125
+ # Success! Break out of retry loop
126
+ break
127
+ except RuntimeError as e:
128
+ error_str = str(e)
129
+ if "CUDA" in error_str:
130
+ print(f"CUDA error encountered: {e}")
131
+ encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
132
+ model.cpu()
133
+ with torch.no_grad():
134
+ outputs = model(**encoding)
135
+ model.to(device)
136
+ break
137
+ elif "index out of range" in error_str:
138
+ print(f"Index error with max_tokens={token_limit}: {e}")
139
+ if token_limit == token_limits[-1]:
140
+ print(f"All token limits exhausted, returning empty results")
141
+ return []
142
+ else:
143
+ print(f"Retrying with smaller token limit...")
144
+ continue
145
+ else:
146
+ raise
147
+ except Exception as e:
148
+ print(f"Unexpected error in model processing: {e}")
149
+ if token_limit == token_limits[-1]:
150
+ return []
151
+ else:
152
+ continue
153
 
154
  try:
155
  tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
 
250
  actual render, then transform them to the effective coordinate space.
251
  """
252
  RENDER_SCALE = 3.0
253
+ MAX_WIDTH = 1800 # Maximum width for a chunk in rendered pixels (reduced to ensure splitting)
254
+ MAX_TOKENS = 512 # Reduced to prevent index out of range errors with large images
255
 
256
  all_results = []
257