Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 9, 2025

Commit

8822f53

verified ·

1 Parent(s): 27937fa

Update ai_mapping.py

Browse files

Files changed (1) hide show

ai_mapping.py +21 -13

ai_mapping.py CHANGED Viewed

@@ -15,17 +15,18 @@ tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
 feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
 model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
-def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str, str]:
     """
     Extract key-value pairs from PDF text using LayoutLMv3-base or fallback to regex.
     Args:
-        text_data (str): Extracted text from PDF.
         pdf_path (str): Path to the PDF file.
     Returns:
         dict: Key-value pairs extracted from the document.
     """
     try:
-        # Fallback to regex if model is untrained
         key_values = {}
         dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text_data)
         amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
@@ -34,17 +35,23 @@ def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str,
         # Attempt LayoutLMv3 processing
         doc = fitz.open(pdf_path)
-        for page_num in range(len(doc)):
             page = doc[page_num]
             pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
             img_path = f"{pdf_path}_page_{page_num}.png"
             pix.save(img_path)
             image = Image.open(img_path).convert("RGB")
-            # Tokenize text
-            words = text_data.splitlines()
             encoding = tokenizer(
                 words,
                 return_tensors="pt",
                 truncation=True,
                 padding=True,
@@ -52,16 +59,17 @@ def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str,
             )
             input_ids = encoding["input_ids"]
             attention_mask = encoding["attention_mask"]
-            # Process image to get bounding boxes
             image_encoding = feature_extractor(image, return_tensors="pt")
-            bbox = image_encoding["bbox"][0]  # Shape: (num_tokens, 4)
             # Ensure bbox length matches input_ids
-            if len(bbox) < len(input_ids[0]):
-                bbox = torch.cat([bbox, torch.zeros((len(input_ids[0]) - len(bbox), 4), dtype=torch.int64)])
-            elif len(bbox) > len(input_ids[0]):
-                bbox = bbox[:len(input_ids[0])]
             # Pass inputs to the model
             with torch.no_grad():
@@ -69,7 +77,7 @@ def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str,
                     input_ids=input_ids,
                     attention_mask=attention_mask,
                     bbox=bbox,
-                    pixel_values=image_encoding["pixel_values"]
                 )
                 predictions = torch.argmax(outputs.logits, dim=2)

 feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
 model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
+def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str, str]:
     """
     Extract key-value pairs from PDF text using LayoutLMv3-base or fallback to regex.
     Args:
+        page_data (list): List of dictionaries with 'text' (str) and 'bbox' (list of [x0, y0, x1, y1]) per page.
         pdf_path (str): Path to the PDF file.
     Returns:
         dict: Key-value pairs extracted from the document.
     """
     try:
+        # Fallback to regex using concatenated text from all pages
+        text_data = " ".join([page["text"] for page in page_data])
         key_values = {}
         dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text_data)
         amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
         # Attempt LayoutLMv3 processing
         doc = fitz.open(pdf_path)
+        for page_num, page_info in enumerate(page_data):
+            if not page_info["text"].strip() or "No text detected" in page_info["text"]:
+                continue
+            # Load image for the page
             page = doc[page_num]
             pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
             img_path = f"{pdf_path}_page_{page_num}.png"
             pix.save(img_path)
             image = Image.open(img_path).convert("RGB")
+            # Tokenize text and prepare bounding boxes
+            words = page_info["text"].split()
+            bboxes = page_info["bbox"]
             encoding = tokenizer(
                 words,
+                boxes=bboxes,
                 return_tensors="pt",
                 truncation=True,
                 padding=True,
             )
             input_ids = encoding["input_ids"]
             attention_mask = encoding["attention_mask"]
+            bbox = encoding["bbox"]
+            # Process image for pixel values
             image_encoding = feature_extractor(image, return_tensors="pt")
+            pixel_values = image_encoding["pixel_values"]
             # Ensure bbox length matches input_ids
+            if len(bbox[0]) < len(input_ids[0]):
+                bbox = torch.cat([bbox, torch.zeros((1, len(input_ids[0]) - len(bbox[0]), 4), dtype=torch.int64)])
+            elif len(bbox[0]) > len(input_ids[0]):
+                bbox = bbox[:, :len(input_ids[0])]
             # Pass inputs to the model
             with torch.no_grad():
                     input_ids=input_ids,
                     attention_mask=attention_mask,
                     bbox=bbox,
+                    pixel_values=pixel_values
                 )
                 predictions = torch.argmax(outputs.logits, dim=2)