Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

pavansuresh committed on Jul 9, 2025

Commit

94d30f1

verified ·

1 Parent(s): c842ecc

Update ai_mapping.py

Files changed (1) hide show

ai_mapping.py CHANGED Viewed

@@ -19,7 +19,8 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
     """
     Extract key-value pairs from PDF text using LayoutLMv3-base or fallback to regex.
     Args:
-        page_data (list): List of dictionaries with 'text' (str), 'words' (list of str), and 'bbox' (list of [x0, y0, x1, y1]) per page.
         pdf_path (str): Path to the PDF file.
     Returns:
         dict: Key-value pairs extracted from the document.
@@ -46,7 +47,7 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
             pix.save(img_path)
             image = Image.open(img_path).convert("RGB")
-            # Tokenize words with corresponding bounding boxes
             words = page_info["words"]
             bboxes = page_info["bbox"]
             encoding = tokenizer(
@@ -65,12 +66,6 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
             image_encoding = feature_extractor(image, return_tensors="pt")
             pixel_values = image_encoding["pixel_values"]
-            # Ensure bbox length matches input_ids
-            if len(bbox[0]) < len(input_ids[0]):
-                bbox = torch.cat([bbox, torch.zeros((1, len(input_ids[0]) - len(bbox[0]), 4), dtype=torch.int64)])
-            elif len(bbox[0]) > len(input_ids[0]):
-                bbox = bbox[:, :len(input_ids[0])]
             # Pass inputs to the model
             with torch.no_grad():
                 outputs = model(

     """
     Extract key-value pairs from PDF text using LayoutLMv3-base or fallback to regex.
     Args:
+        page_data (list): List of dictionaries with 'text' (str), 'words' (list of str),
+                          'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) per page.
         pdf_path (str): Path to the PDF file.
     Returns:
         dict: Key-value pairs extracted from the document.
             pix.save(img_path)
             image = Image.open(img_path).convert("RGB")
+            # Tokenize words with corresponding normalized bounding boxes
             words = page_info["words"]
             bboxes = page_info["bbox"]
             encoding = tokenizer(
             image_encoding = feature_extractor(image, return_tensors="pt")
             pixel_values = image_encoding["pixel_values"]
             # Pass inputs to the model
             with torch.no_grad():
                 outputs = model(