Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +147 -43
working_yolo_pipeline.py
CHANGED
|
@@ -2759,58 +2759,162 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
|
|
| 2759 |
|
| 2760 |
|
| 2761 |
|
| 2762 |
-
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
|
| 2763 |
-
# List[Dict[str, Any]]]:
|
| 2764 |
-
def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2765 |
-
|
| 2766 |
|
| 2767 |
-
|
| 2768 |
-
|
| 2769 |
-
|
| 2770 |
|
| 2771 |
-
|
| 2772 |
-
|
| 2773 |
-
|
| 2774 |
|
| 2775 |
-
|
| 2776 |
-
|
| 2777 |
-
|
| 2778 |
|
| 2779 |
-
|
| 2780 |
-
|
| 2781 |
-
|
| 2782 |
-
|
| 2783 |
-
|
| 2784 |
|
| 2785 |
-
|
| 2786 |
-
|
| 2787 |
-
|
| 2788 |
-
|
| 2789 |
-
|
| 2790 |
-
|
| 2791 |
-
|
| 2792 |
-
|
| 2793 |
-
|
| 2794 |
-
|
| 2795 |
-
|
| 2796 |
-
|
| 2797 |
-
|
| 2798 |
-
|
| 2799 |
-
|
| 2800 |
-
|
| 2801 |
-
|
| 2802 |
-
|
| 2803 |
-
|
| 2804 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2805 |
)
|
| 2806 |
-
|
| 2807 |
-
|
| 2808 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2809 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2810 |
|
| 2811 |
-
|
| 2812 |
-
|
|
|
|
|
|
|
|
|
|
| 2813 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2814 |
|
| 2815 |
|
| 2816 |
|
|
|
|
| 2759 |
|
| 2760 |
|
| 2761 |
|
| 2762 |
+
# # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
|
| 2763 |
+
# # List[Dict[str, Any]]]:
|
| 2764 |
+
# def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2765 |
+
# if not os.path.exists(input_pdf_path): return None
|
| 2766 |
|
| 2767 |
+
# print("\n" + "#" * 80)
|
| 2768 |
+
# print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
|
| 2769 |
+
# print("#" * 80)
|
| 2770 |
|
| 2771 |
+
# pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
|
| 2772 |
+
# temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
|
| 2773 |
+
# os.makedirs(temp_pipeline_dir, exist_ok=True)
|
| 2774 |
|
| 2775 |
+
# preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
|
| 2776 |
+
# raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
|
| 2777 |
+
# structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
|
| 2778 |
|
| 2779 |
+
# final_result = None
|
| 2780 |
+
# try:
|
| 2781 |
+
# # Phase 1: Preprocessing with YOLO First + Masking
|
| 2782 |
+
# preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
|
| 2783 |
+
# if not preprocessed_json_path_out: return None
|
| 2784 |
|
| 2785 |
+
# # Phase 2: Inference
|
| 2786 |
+
# page_raw_predictions_list = run_inference_and_get_raw_words(
|
| 2787 |
+
# input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
|
| 2788 |
+
# )
|
| 2789 |
+
# if not page_raw_predictions_list: return None
|
| 2790 |
+
|
| 2791 |
+
# # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
|
| 2792 |
+
# # Save raw predictions to the temporary file
|
| 2793 |
+
# with open(raw_output_path, 'w', encoding='utf-8') as f:
|
| 2794 |
+
# json.dump(page_raw_predictions_list, f, indent=4)
|
| 2795 |
+
|
| 2796 |
+
# # Explicitly copy/save the raw predictions to the user-specified debug path
|
| 2797 |
+
# # if raw_predictions_output_path:
|
| 2798 |
+
# # shutil.copy(raw_output_path, raw_predictions_output_path)
|
| 2799 |
+
# # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
|
| 2800 |
+
# # ----------------------------------------
|
| 2801 |
+
|
| 2802 |
+
# # Phase 3: Decoding
|
| 2803 |
+
# structured_data_list = convert_bio_to_structured_json_relaxed(
|
| 2804 |
+
# raw_output_path, structured_intermediate_output_path
|
| 2805 |
+
# )
|
| 2806 |
+
# if not structured_data_list: return None
|
| 2807 |
+
# structured_data_list = correct_misaligned_options(structured_data_list)
|
| 2808 |
+
# structured_data_list = process_context_linking(structured_data_list)
|
| 2809 |
+
|
| 2810 |
+
|
| 2811 |
+
# # Phase 4: Embedding / Equation to LaTeX Conversion
|
| 2812 |
+
# final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 2813 |
+
|
| 2814 |
+
|
| 2815 |
+
|
| 2816 |
+
|
| 2817 |
+
|
| 2818 |
+
|
| 2819 |
+
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
    """
    Wrap a standard image file into a single-page PyMuPDF Document.

    This allows images to be processed by existing PDF-based functions
    (coordinate scaling, column detection, ...) without modification.

    Args:
        image_path: Path to an image file readable by Pillow.

    Returns:
        (doc, page): the in-memory PDF document and its single page.
        The caller is responsible for calling ``doc.close()``.

    Raises:
        FileNotFoundError: if ``image_path`` does not exist.
        PIL.UnidentifiedImageError: if the file is not a valid image.
    """
    import io  # local import: io may not be imported at module level

    # Render the image into an in-memory PDF stream.  Image.save(format="PDF")
    # is the documented Pillow API (the previous tobytes("pdf") route leaked
    # both the Pillow handle and an intermediate fitz Document).
    pdf_buffer = io.BytesIO()
    with Image.open(image_path) as img:
        # PDF pages cannot embed alpha / palette modes; flatten to RGB first
        # so PNG/WebP inputs with transparency do not raise on save.
        if img.mode in ("RGBA", "LA", "P"):
            img = img.convert("RGB")
        img.save(pdf_buffer, format="PDF")

    doc = fitz.open("pdf", pdf_buffer.getvalue())
    return doc, doc[0]
|
| 2830 |
+
|
| 2831 |
+
def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
    """
    Main pipeline entry point; handles both PDF and image inputs.

    Detects the file type by extension, extracts per-page blocks via YOLO +
    OCR, runs LayoutLMv3 inference on the gathered blocks, and optionally
    applies subject/concept classification.

    Args:
        input_path: Path to a PDF or image file (.jpg/.jpeg/.png/.bmp/.tiff/.webp).
        layoutlmv3_model_path: Path to the LayoutLMv3 model used for inference.

    Returns:
        The structured inference result, or None if the input is missing,
        cannot be opened, or yields no page data.
    """
    # Fail fast on a missing input instead of crashing inside fitz/Pillow.
    if not os.path.exists(input_path):
        print(f"❌ Input file not found: {input_path}")
        return None

    # Initialize the YOLO layout model (LayoutLMv3 is loaded by the
    # inference step below).
    yolo_model = YOLO(WEIGHTS_PATH)

    # 1. DETECT FILE TYPE
    ext = os.path.splitext(input_path)[1].lower()
    is_image = ext in {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}

    all_pages_data = []

    if is_image:
        print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")

        # 2. IMAGE BRANCH: wrap the image into a fitz page so the
        # PDF-oriented helpers below work unchanged.  Guarded like the
        # PDF branch so a corrupt image does not crash the pipeline.
        try:
            doc, page = load_image_as_fitz_page(input_path)
        except Exception as e:
            print(f"❌ Error opening image {input_path}: {e}")
            return None

        try:
            # Render at 2.0x scale for YOLO analysis (matches the PDF branch).
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img_np = pixmap_to_numpy(pix)

            # 3. PROCESS THE PAGE.  The wrapped page has no native text,
            # so the Tesseract OCR fallback in preprocess_and_ocr_page
            # triggers automatically.
            page_data, _ = preprocess_and_ocr_page(
                img_np,
                yolo_model,
                input_path,
                0,  # page index 0 (single-page document)
                page,
                os.path.basename(input_path),
            )
            if page_data:
                all_pages_data.append(page_data)
        finally:
            # Always release the in-memory document, even on failure.
            doc.close()

    else:
        # 4. PDF BRANCH: standard multi-page processing.
        try:
            doc = fitz.open(input_path)
        except Exception as e:
            print(f"❌ Error opening PDF {input_path}: {e}")
            return None

        try:
            print(f"📄 Processing PDF with {len(doc)} pages: {input_path}")

            for page_index in range(len(doc)):
                page = doc[page_index]

                # Render page at 2.0x scale (consistent with original script).
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                img_np = pixmap_to_numpy(pix)

                page_data, _ = preprocess_and_ocr_page(
                    img_np,
                    yolo_model,
                    input_path,
                    page_index,
                    page,
                    os.path.basename(input_path),
                )
                if page_data:
                    all_pages_data.append(page_data)
        except Exception as e:
            print(f"❌ Error processing PDF {input_path}: {e}")
            return None
        finally:
            # Close the document on every exit path (the original leaked
            # it whenever page processing raised).
            doc.close()

    # Nothing extracted -> no point running inference on an empty list.
    if not all_pages_data:
        print(f"❌ No page data could be extracted from {input_path}.")
        return None

    # 5. CONTINUE EXACTLY AS BEFORE: sequence all blocks from all pages
    # (or the single image page) and run inference.
    sequential_blocks = []
    for p_data in all_pages_data:
        sequential_blocks.extend(p_data.get('blocks', []))

    # Run LayoutLMv3 inference on the gathered blocks.
    final_structured_data = run_layoutlmv3_inference_on_blocks(
        sequential_blocks,
        layoutlmv3_model_path,
    )

    # Optional subject/concept classification pass (skipped when the
    # classifier models cannot be loaded).
    classifier = HierarchicalClassifier()
    if classifier.load_models():
        final_structured_data = post_process_json_with_inference(final_structured_data, classifier)

    return final_structured_data
|
| 2918 |
|
| 2919 |
|
| 2920 |
|