Spaces:

heerjtdev
/

layout_latex

Sleeping

App Files Community

heerjtdev committed 2 days ago

Commit

b5a5969

verified ·

1 Parent(s): becd980

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +142 -17

working_yolo_pipeline.py CHANGED Viewed

@@ -2420,6 +2420,137 @@ import time
 import traceback
 import glob
 def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
     if not os.path.exists(input_pdf_path):
         print(f"❌ ERROR: File not found: {input_pdf_path}")
@@ -2438,14 +2569,10 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
     preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
     raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
-    # If the user didn't provide a path, create one in the temp directory
     if structured_intermediate_output_path is None:
         structured_intermediate_output_path = os.path.join(
             temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
         )
     final_result = None
     try:
@@ -2468,7 +2595,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
             print("❌ FAILED at Step 2: Inference returned no data.")
             return None
-        # Save raw predictions for Step 3
         with open(raw_output_path, 'w', encoding='utf-8') as f:
             json.dump(page_raw_predictions_list, f, indent=4)
         print(f"✅ Step 2 Complete ({time.time() - p2_start:.2f}s)")
@@ -2483,7 +2609,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
             print("❌ FAILED at Step 3: BIO conversion failed.")
             return None
-        # Logic adjustments
         print("... Correcting misalignments and linking context ...")
         structured_data_list = correct_misaligned_options(structured_data_list)
         structured_data_list = process_context_linking(structured_data_list)
@@ -2498,20 +2623,11 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
             return None
         print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
-        # --- ADD THIS NEW STEP HERE ---
         print(f"\n[Step 4.5/5] Adding Question Type Classification...")
         p4_5_start = time.time()
         final_result = add_question_type_validation(final_result)
         print(f"✅ Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
-        # --- END OF NEW STEP ---
         # --- Phase 5: Hierarchical Tagging ---
         print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
@@ -2523,6 +2639,16 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
         else:
             print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
     except Exception as e:
         print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
         print(f"Error Message: {str(e)}")
@@ -2550,7 +2676,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Complete Pipeline")
     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")

 import traceback
 import glob
+# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
+#     if not os.path.exists(input_pdf_path):
+#         print(f"❌ ERROR: File not found: {input_pdf_path}")
+#         return None
+#     print("\n" + "#" * 80)
+#     print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
+#     print(f"Input: {input_pdf_path}")
+#     print("#" * 80)
+#     overall_start = time.time()
+#     pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
+#     temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
+#     os.makedirs(temp_pipeline_dir, exist_ok=True)
+#     preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
+#     raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
+#     # If the user didn't provide a path, create one in the temp directory
+#     if structured_intermediate_output_path is None:
+#         structured_intermediate_output_path = os.path.join(
+#             temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
+#         )
+#     final_result = None
+#     try:
+#         # --- Phase 1: Preprocessing ---
+#         print(f"\n[Step 1/5] Preprocessing (YOLO + Masking)...")
+#         p1_start = time.time()
+#         preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
+#         if not preprocessed_json_path_out:
+#             print("❌ FAILED at Step 1: Preprocessing returned None.")
+#             return None
+#         print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
+#         # --- Phase 2: Inference ---
+#         print(f"\n[Step 2/5] Inference (LayoutLMv3)...")
+#         p2_start = time.time()
+#         page_raw_predictions_list = run_inference_and_get_raw_words(
+#             input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
+#         )
+#         if not page_raw_predictions_list:
+#             print("❌ FAILED at Step 2: Inference returned no data.")
+#             return None
+#         # Save raw predictions for Step 3
+#         with open(raw_output_path, 'w', encoding='utf-8') as f:
+#             json.dump(page_raw_predictions_list, f, indent=4)
+#         print(f"✅ Step 2 Complete ({time.time() - p2_start:.2f}s)")
+#         # --- Phase 3: Decoding ---
+#         print(f"\n[Step 3/5] Decoding (BIO to Structured JSON)...")
+#         p3_start = time.time()
+#         structured_data_list = convert_bio_to_structured_json_relaxed(
+#             raw_output_path, structured_intermediate_output_path
+#         )
+#         if not structured_data_list:
+#             print("❌ FAILED at Step 3: BIO conversion failed.")
+#             return None
+#         # Logic adjustments
+#         print("... Correcting misalignments and linking context ...")
+#         structured_data_list = correct_misaligned_options(structured_data_list)
+#         structured_data_list = process_context_linking(structured_data_list)
+#         print(f"✅ Step 3 Complete ({time.time() - p3_start:.2f}s)")
+#         # --- Phase 4: Base64 & LaTeX ---
+#         print(f"\n[Step 4/5] Finalizing Layout (Base64 Images & LaTeX)...")
+#         p4_start = time.time()
+#         final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
+#         if not final_result:
+#             print("❌ FAILED at Step 4: Final formatting failed.")
+#             return None
+#         print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
+#         # --- ADD THIS NEW STEP HERE ---
+#         print(f"\n[Step 4.5/5] Adding Question Type Classification...")
+#         p4_5_start = time.time()
+#         final_result = add_question_type_validation(final_result)
+#         print(f"✅ Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
+#         # --- END OF NEW STEP ---
+#         # --- Phase 5: Hierarchical Tagging ---
+#         print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
+#         p5_start = time.time()
+#         classifier = HierarchicalClassifier()
+#         if classifier.load_models():
+#             final_result = post_process_json_with_inference(final_result, classifier)
+#             print(f"✅ Step 5 Complete: Tags added ({time.time() - p5_start:.2f}s)")
+#         else:
+#             print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
+#     except Exception as e:
+#         print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
+#         print(f"Error Message: {str(e)}")
+#         traceback.print_exc()
+#         return None
+#     finally:
+#         print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
+#         try:
+#             for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
+#                 os.remove(f)
+#             os.rmdir(temp_pipeline_dir)
+#             print("🧹 Cleanup successful.")
+#         except Exception as e:
+#             print(f"⚠️ Cleanup failed: {e}")
+#     total_time = time.time() - overall_start
+#     print("\n" + "#" * 80)
+#     print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
+#     print("#" * 80)
+#     return final_result
 def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
     if not os.path.exists(input_pdf_path):
         print(f"❌ ERROR: File not found: {input_pdf_path}")
     preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
     raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
     if structured_intermediate_output_path is None:
         structured_intermediate_output_path = os.path.join(
             temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
         )
     final_result = None
     try:
             print("❌ FAILED at Step 2: Inference returned no data.")
             return None
         with open(raw_output_path, 'w', encoding='utf-8') as f:
             json.dump(page_raw_predictions_list, f, indent=4)
         print(f"✅ Step 2 Complete ({time.time() - p2_start:.2f}s)")
             print("❌ FAILED at Step 3: BIO conversion failed.")
             return None
         print("... Correcting misalignments and linking context ...")
         structured_data_list = correct_misaligned_options(structured_data_list)
         structured_data_list = process_context_linking(structured_data_list)
             return None
         print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
+        # --- Phase 4.5: Question Type Classification ---
         print(f"\n[Step 4.5/5] Adding Question Type Classification...")
         p4_5_start = time.time()
         final_result = add_question_type_validation(final_result)
         print(f"✅ Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
         # --- Phase 5: Hierarchical Tagging ---
         print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
         else:
             print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
+        # ============================================================
+        # 🔧 NEW STEP: FILTER OUT METADATA ENTRIES
+        # ============================================================
+        print(f"\n[Post-Processing] Removing METADATA entries...")
+        initial_count = len(final_result)
+        final_result = [item for item in final_result if item.get('type') != 'METADATA']
+        removed_count = initial_count - len(final_result)
+        print(f"✅ Removed {removed_count} METADATA entries. {len(final_result)} questions remain.")
+        # ============================================================
     except Exception as e:
         print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
         print(f"Error Message: {str(e)}")
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Complete Pipeline")
     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")