Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +23 -23
working_yolo_pipeline.py
CHANGED
|
@@ -2814,13 +2814,11 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
|
|
| 2814 |
|
| 2815 |
|
| 2816 |
|
| 2817 |
-
|
| 2818 |
-
|
| 2819 |
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
|
| 2820 |
"""
|
| 2821 |
Wraps a standard image file into a single-page PyMuPDF Document.
|
| 2822 |
-
This
|
| 2823 |
-
|
| 2824 |
"""
|
| 2825 |
img = Image.open(image_path)
|
| 2826 |
# Convert image to a PDF stream in memory
|
|
@@ -2830,53 +2828,47 @@ def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
|
|
| 2830 |
|
| 2831 |
def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
| 2832 |
"""
|
| 2833 |
-
Main pipeline
|
| 2834 |
"""
|
| 2835 |
-
#
|
| 2836 |
yolo_model = YOLO(WEIGHTS_PATH)
|
| 2837 |
|
| 2838 |
-
#
|
| 2839 |
ext = os.path.splitext(input_path)[1].lower()
|
| 2840 |
is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
|
| 2841 |
|
| 2842 |
all_pages_data = []
|
| 2843 |
|
|
|
|
| 2844 |
if is_image:
|
| 2845 |
print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
|
| 2846 |
-
|
| 2847 |
-
# 2. IMAGE BRANCH: Wrap image into a fitz page
|
| 2848 |
doc, page = load_image_as_fitz_page(input_path)
|
| 2849 |
|
| 2850 |
-
#
|
|
|
|
| 2851 |
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 2852 |
img_np = pixmap_to_numpy(pix)
|
| 2853 |
|
| 2854 |
-
# 3. PROCESS THE PAGE
|
| 2855 |
-
# Because the 'page' has no native text, the Tesseract OCR fallback
|
| 2856 |
-
# in preprocess_and_ocr_page will trigger automatically.
|
| 2857 |
page_data, _ = preprocess_and_ocr_page(
|
| 2858 |
img_np,
|
| 2859 |
yolo_model,
|
| 2860 |
input_path,
|
| 2861 |
-
0, # Page
|
| 2862 |
page,
|
| 2863 |
os.path.basename(input_path)
|
| 2864 |
)
|
| 2865 |
-
|
| 2866 |
if page_data:
|
| 2867 |
all_pages_data.append(page_data)
|
| 2868 |
doc.close()
|
| 2869 |
|
| 2870 |
else:
|
| 2871 |
-
#
|
| 2872 |
try:
|
| 2873 |
doc = fitz.open(input_path)
|
| 2874 |
print(f"π Processing PDF with {len(doc)} pages: {input_path}")
|
| 2875 |
|
| 2876 |
for page_index in range(len(doc)):
|
| 2877 |
page = doc[page_index]
|
| 2878 |
-
|
| 2879 |
-
# Render page at 2.0x scale (consistent with your original script)
|
| 2880 |
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 2881 |
img_np = pixmap_to_numpy(pix)
|
| 2882 |
|
|
@@ -2888,35 +2880,43 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
|
| 2888 |
page,
|
| 2889 |
os.path.basename(input_path)
|
| 2890 |
)
|
| 2891 |
-
|
| 2892 |
if page_data:
|
| 2893 |
all_pages_data.append(page_data)
|
| 2894 |
-
|
| 2895 |
doc.close()
|
| 2896 |
except Exception as e:
|
| 2897 |
print(f"❌ Error opening PDF {input_path}: {e}")
|
| 2898 |
return None
|
| 2899 |
|
| 2900 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2901 |
# Sequence all blocks from all pages (or the single image page)
|
| 2902 |
sequential_blocks = []
|
| 2903 |
for p_data in all_pages_data:
|
| 2904 |
sequential_blocks.extend(p_data.get('blocks', []))
|
| 2905 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2906 |
# Run LayoutLMv3 Inference on the gathered blocks
|
| 2907 |
final_structured_data = run_layoutlmv3_inference_on_blocks(
|
| 2908 |
sequential_blocks,
|
| 2909 |
layoutlmv3_model_path
|
| 2910 |
)
|
| 2911 |
|
| 2912 |
-
# Run
|
| 2913 |
classifier = HierarchicalClassifier()
|
| 2914 |
if classifier.load_models():
|
| 2915 |
final_structured_data = post_process_json_with_inference(final_structured_data, classifier)
|
|
|
|
|
|
|
|
|
|
| 2916 |
|
| 2917 |
return final_structured_data
|
| 2918 |
|
| 2919 |
-
|
| 2920 |
|
| 2921 |
#================================================================================
|
| 2922 |
# --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
|
|
|
|
| 2814 |
|
| 2815 |
|
| 2816 |
|
|
|
|
|
|
|
| 2817 |
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
|
| 2818 |
"""
|
| 2819 |
Wraps a standard image file into a single-page PyMuPDF Document.
|
| 2820 |
+
This ensures it can be processed by your existing fitz-based functions
|
| 2821 |
+
(coordinate scaling, column detection, etc.) exactly as before.
|
| 2822 |
"""
|
| 2823 |
img = Image.open(image_path)
|
| 2824 |
# Convert image to a PDF stream in memory
|
|
|
|
| 2828 |
|
| 2829 |
def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
| 2830 |
"""
|
| 2831 |
+
Main pipeline modified to handle both PDF and Image files.
|
| 2832 |
"""
|
| 2833 |
+
# 1. INITIALIZE MODELS (Preserving original logic)
|
| 2834 |
yolo_model = YOLO(WEIGHTS_PATH)
|
| 2835 |
|
| 2836 |
+
# 2. DETECT FILE TYPE
|
| 2837 |
ext = os.path.splitext(input_path)[1].lower()
|
| 2838 |
is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
|
| 2839 |
|
| 2840 |
all_pages_data = []
|
| 2841 |
|
| 2842 |
+
# 3. BRANCH LOGIC: IMAGE VS PDF
|
| 2843 |
if is_image:
|
| 2844 |
print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
|
|
|
|
|
|
|
| 2845 |
doc, page = load_image_as_fitz_page(input_path)
|
| 2846 |
|
| 2847 |
+
# Process as Page 0. Because there is no native text, your existing
|
| 2848 |
+
# Tesseract fallback will naturally trigger to read the content.
|
| 2849 |
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 2850 |
img_np = pixmap_to_numpy(pix)
|
| 2851 |
|
|
|
|
|
|
|
|
|
|
| 2852 |
page_data, _ = preprocess_and_ocr_page(
|
| 2853 |
img_np,
|
| 2854 |
yolo_model,
|
| 2855 |
input_path,
|
| 2856 |
+
0, # Page 0
|
| 2857 |
page,
|
| 2858 |
os.path.basename(input_path)
|
| 2859 |
)
|
|
|
|
| 2860 |
if page_data:
|
| 2861 |
all_pages_data.append(page_data)
|
| 2862 |
doc.close()
|
| 2863 |
|
| 2864 |
else:
|
| 2865 |
+
# Standard PDF Processing Loop
|
| 2866 |
try:
|
| 2867 |
doc = fitz.open(input_path)
|
| 2868 |
print(f"π Processing PDF with {len(doc)} pages: {input_path}")
|
| 2869 |
|
| 2870 |
for page_index in range(len(doc)):
|
| 2871 |
page = doc[page_index]
|
|
|
|
|
|
|
| 2872 |
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 2873 |
img_np = pixmap_to_numpy(pix)
|
| 2874 |
|
|
|
|
| 2880 |
page,
|
| 2881 |
os.path.basename(input_path)
|
| 2882 |
)
|
|
|
|
| 2883 |
if page_data:
|
| 2884 |
all_pages_data.append(page_data)
|
|
|
|
| 2885 |
doc.close()
|
| 2886 |
except Exception as e:
|
| 2887 |
print(f"❌ Error opening PDF {input_path}: {e}")
|
| 2888 |
return None
|
| 2889 |
|
| 2890 |
+
# 4. CONTINUE EXACTLY AS BEFORE: Gathering and Inference
|
| 2891 |
+
if not all_pages_data:
|
| 2892 |
+
print("β No data extracted from document.")
|
| 2893 |
+
return None
|
| 2894 |
+
|
| 2895 |
# Sequence all blocks from all pages (or the single image page)
|
| 2896 |
sequential_blocks = []
|
| 2897 |
for p_data in all_pages_data:
|
| 2898 |
sequential_blocks.extend(p_data.get('blocks', []))
|
| 2899 |
|
| 2900 |
+
print("\n" + "=" * 80)
|
| 2901 |
+
print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
|
| 2902 |
+
print("=" * 80)
|
| 2903 |
+
|
| 2904 |
# Run LayoutLMv3 Inference on the gathered blocks
|
| 2905 |
final_structured_data = run_layoutlmv3_inference_on_blocks(
|
| 2906 |
sequential_blocks,
|
| 2907 |
layoutlmv3_model_path
|
| 2908 |
)
|
| 2909 |
|
| 2910 |
+
# Run Hierarchical classification (Subject/Concept tags)
|
| 2911 |
classifier = HierarchicalClassifier()
|
| 2912 |
if classifier.load_models():
|
| 2913 |
final_structured_data = post_process_json_with_inference(final_structured_data, classifier)
|
| 2914 |
+
print("✅ Classification complete. Tags added.")
|
| 2915 |
+
else:
|
| 2916 |
+
print("β Classifier not found. Returning untagged data.")
|
| 2917 |
|
| 2918 |
return final_structured_data
|
| 2919 |
|
|
|
|
| 2920 |
|
| 2921 |
#================================================================================
|
| 2922 |
# --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
|