Spaces:

heerjtdev
/

layout_latex

Sleeping

App Files Community

heerjtdev committed 22 days ago

Commit

bbc046a

verified ·

1 Parent(s): 66a0ed8

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +19 -26

working_yolo_pipeline.py CHANGED Viewed

@@ -2075,10 +2075,6 @@ def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
     doc = fitz.open("pdf", pdf_stream.read())
     return doc, doc[0]
 def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
     """
     Modified pipeline that handles both PDFs and Images, running YOLO,
@@ -2088,7 +2084,6 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
     yolo_model = YOLO(WEIGHTS_PATH)
     # 2. DETECT FILE TYPE
-    # FIX: [1] added to get the extension string from the (root, ext) tuple
     ext = os.path.splitext(input_path)[1].lower()
     is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
@@ -2098,10 +2093,8 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
     try:
         if is_image:
             print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
-            # Use the corrected helper function defined above
             doc, page = load_image_as_fitz_page(input_path)
-            # Render for YOLO
             pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
             img_np = pixmap_to_numpy(pix)
@@ -2112,7 +2105,6 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
                 all_pages_data.append(page_data)
             doc.close()
         else:
-            # --- ORIGINAL PDF LOGIC ---
             doc = fitz.open(input_path)
             print(f"📄 Processing PDF: {pdf_name} ({len(doc)} pages)")
             for page_index in range(len(doc)):
@@ -2131,26 +2123,14 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
             print("❌ No data extracted.")
             return None
-        # # 3. CONSOLIDATE BLOCKS FOR INFERENCE
-        # sequential_blocks = []
-        # for p_data in all_pages_data:
-        #     sequential_blocks.extend(p_data.get('blocks', []))
-        # 3. CONSOLIDATE BLOCKS FOR INFERENCE
         sequential_blocks = []
         for p_data in all_pages_data:
             if isinstance(p_data, dict):
-                # If it's a dictionary, extract the 'blocks' key
                 blocks = p_data.get('blocks', [])
                 sequential_blocks.extend(blocks)
             elif isinstance(p_data, list):
-                # If it's already a list, add it directly
                 sequential_blocks.extend(p_data)
-            else:
-                print(f"⚠️ Warning: Unexpected data type in all_pages_data: {type(p_data)}")
         # --- 4. STARTING LAYOUTLMV3 INFERENCE ---
         print("\n" + "=" * 80)
@@ -2160,10 +2140,26 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
         tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        # Note: Ensure LayoutLMv3ForTokenClassification is defined in your script
         model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
         checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
-        model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
         model.to(device)
         model.eval()
@@ -2178,7 +2174,6 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
         return final_result
     except Exception as e:
-        # Improved error logging to catch exactly where it fails
         import traceback
         traceback.print_exc()
         print(f"❌ FATAL ERROR in pipeline: {e}")
@@ -2186,8 +2181,6 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
 # #================================================================================
 #         # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---

     doc = fitz.open("pdf", pdf_stream.read())
     return doc, doc[0]
 def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
     """
     Modified pipeline that handles both PDFs and Images, running YOLO,
     yolo_model = YOLO(WEIGHTS_PATH)
     # 2. DETECT FILE TYPE
     ext = os.path.splitext(input_path)[1].lower()
     is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
     try:
         if is_image:
             print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
             doc, page = load_image_as_fitz_page(input_path)
             pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
             img_np = pixmap_to_numpy(pix)
                 all_pages_data.append(page_data)
             doc.close()
         else:
             doc = fitz.open(input_path)
             print(f"📄 Processing PDF: {pdf_name} ({len(doc)} pages)")
             for page_index in range(len(doc)):
             print("❌ No data extracted.")
             return None
+        # 3. CONSOLIDATE BLOCKS FOR INFERENCE (Safe against List vs Dict)
         sequential_blocks = []
         for p_data in all_pages_data:
             if isinstance(p_data, dict):
                 blocks = p_data.get('blocks', [])
                 sequential_blocks.extend(blocks)
             elif isinstance(p_data, list):
                 sequential_blocks.extend(p_data)
         # --- 4. STARTING LAYOUTLMV3 INFERENCE ---
         print("\n" + "=" * 80)
         tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
+        # --- FIX: ROBUST KEY REMAPPING FOR LAYOUTLMV3 ---
         checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
+        state_dict = checkpoint.get('model_state_dict', checkpoint)
+        # Rename keys from 'layoutlm.xxx' to 'layoutlmv3.xxx' if necessary
+        new_state_dict = {}
+        for key, value in state_dict.items():
+            if key.startswith("layoutlm."):
+                new_key = key.replace("layoutlm.", "layoutlmv3.", 1)
+                new_state_dict[new_key] = value
+            else:
+                new_state_dict[key] = value
+        # Load with strict=False to handle minor metadata differences
+        model.load_state_dict(new_state_dict, strict=False)
+        # -----------------------------------------------
         model.to(device)
         model.eval()
         return final_result
     except Exception as e:
         import traceback
         traceback.print_exc()
         print(f"❌ FATAL ERROR in pipeline: {e}")
 # #================================================================================
 #         # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---