Spaces:

alfonsovelp
/

llm_document

Sleeping

App Files Files Community

Alfonso Velasco committed on Oct 23, 2025

Commit

10a2064

1 Parent(s): 50304f8

fix chunk

Browse files

Files changed (1) hide show

app.py +65 -45

app.py CHANGED Viewed

@@ -78,58 +78,78 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
         print(f"Invalid image dimensions: {img_width}x{img_height}")
         return []
-    try:
-        encoding = processor(
-            image,
-            truncation=True,
-            padding="max_length",
-            max_length=max_tokens,
-            return_tensors="pt"
-        )
-    except Exception as e:
-        print(f"OCR failed: {e}, using fallback")
         try:
             encoding = processor(
                 image,
-                text=[""] * max_tokens,
-                boxes=[[0, 0, 0, 0]] * max_tokens,
                 truncation=True,
                 padding="max_length",
-                max_length=max_tokens,
                 return_tensors="pt"
             )
-        except Exception as e2:
-            print(f"Fallback also failed: {e2}")
-            return []
-    encoding_device = {}
-    for k, v in encoding.items():
-        if isinstance(v, torch.Tensor):
-            encoding_device[k] = v.to(device)
-            if k == "bbox":
-                encoding_device[k] = torch.clamp(encoding_device[k], 0, 1000)
-    encoding = encoding_device
-    try:
-        with torch.no_grad():
-            outputs = model(**encoding)
-    except RuntimeError as e:
-        if "CUDA" in str(e):
-            print(f"CUDA error encountered: {e}")
-            encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
-            model.cpu()
             with torch.no_grad():
                 outputs = model(**encoding)
-            model.to(device)
-        elif "index out of range" in str(e):
-            print(f"Index error in model processing: {e}")
-            return []
-        else:
-            raise
-    except Exception as e:
-        print(f"Unexpected error in model processing: {e}")
-        return []
     try:
         tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
@@ -230,8 +250,8 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
     actual render, then transform them to the effective coordinate space.
     """
     RENDER_SCALE = 3.0
-    MAX_WIDTH = 2000  # Maximum width for a chunk in rendered pixels
-    MAX_TOKENS = 768
     all_results = []

         print(f"Invalid image dimensions: {img_width}x{img_height}")
         return []
+    # Try multiple token limits if we hit errors
+    token_limits = [max_tokens, 384, 256] if max_tokens > 256 else [max_tokens]
+    for token_limit in token_limits:
         try:
             encoding = processor(
                 image,
                 truncation=True,
                 padding="max_length",
+                max_length=token_limit,
                 return_tensors="pt"
             )
+        except Exception as e:
+            print(f"OCR failed with max_tokens={token_limit}: {e}")
+            if token_limit == token_limits[-1]:
+                # Last attempt, try fallback
+                try:
+                    encoding = processor(
+                        image,
+                        text=[""] * token_limit,
+                        boxes=[[0, 0, 0, 0]] * token_limit,
+                        truncation=True,
+                        padding="max_length",
+                        max_length=token_limit,
+                        return_tensors="pt"
+                    )
+                except Exception as e2:
+                    print(f"Fallback also failed: {e2}")
+                    return []
+            else:
+                continue
+        encoding_device = {}
+        for k, v in encoding.items():
+            if isinstance(v, torch.Tensor):
+                encoding_device[k] = v.to(device)
+                if k == "bbox":
+                    encoding_device[k] = torch.clamp(encoding_device[k], 0, 1000)
+        encoding = encoding_device
+        try:
             with torch.no_grad():
                 outputs = model(**encoding)
+            # Success! Break out of retry loop
+            break
+        except RuntimeError as e:
+            error_str = str(e)
+            if "CUDA" in error_str:
+                print(f"CUDA error encountered: {e}")
+                encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
+                model.cpu()
+                with torch.no_grad():
+                    outputs = model(**encoding)
+                model.to(device)
+                break
+            elif "index out of range" in error_str:
+                print(f"Index error with max_tokens={token_limit}: {e}")
+                if token_limit == token_limits[-1]:
+                    print(f"All token limits exhausted, returning empty results")
+                    return []
+                else:
+                    print(f"Retrying with smaller token limit...")
+                    continue
+            else:
+                raise
+        except Exception as e:
+            print(f"Unexpected error in model processing: {e}")
+            if token_limit == token_limits[-1]:
+                return []
+            else:
+                continue
     try:
         tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
     actual render, then transform them to the effective coordinate space.
     """
     RENDER_SCALE = 3.0
+    MAX_WIDTH = 1800  # Maximum width for a chunk in rendered pixels (reduced to ensure splitting)
+    MAX_TOKENS = 512  # Reduced to prevent index out of range errors with large images
     all_results = []