Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +26 -17
working_yolo_pipeline.py
CHANGED
|
@@ -2058,19 +2058,27 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
|
|
| 2058 |
# ============================================================================
|
| 2059 |
|
| 2060 |
|
| 2061 |
-
|
| 2062 |
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
|
| 2063 |
"""
|
| 2064 |
-
Wraps an image into a temporary PyMuPDF document/page.
|
| 2065 |
-
|
| 2066 |
-
to work on images exactly as they do on PDFs.
|
| 2067 |
"""
|
| 2068 |
-
|
| 2069 |
-
|
| 2070 |
-
|
| 2071 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2072 |
return doc, doc[0]
|
| 2073 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2074 |
def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
| 2075 |
"""
|
| 2076 |
Modified pipeline that handles both PDFs and Images, running YOLO,
|
|
@@ -2079,8 +2087,9 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
|
| 2079 |
# 1. INITIALIZE YOLO
|
| 2080 |
yolo_model = YOLO(WEIGHTS_PATH)
|
| 2081 |
|
| 2082 |
-
# 2. DETECT FILE
|
| 2083 |
-
|
|
|
|
| 2084 |
is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
|
| 2085 |
|
| 2086 |
all_pages_data = []
|
|
@@ -2089,14 +2098,13 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
|
| 2089 |
try:
|
| 2090 |
if is_image:
|
| 2091 |
print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
|
|
|
|
| 2092 |
doc, page = load_image_as_fitz_page(input_path)
|
| 2093 |
|
| 2094 |
-
# Render for YOLO
|
| 2095 |
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 2096 |
img_np = pixmap_to_numpy(pix)
|
| 2097 |
|
| 2098 |
-
# Since an image has no native text layer, preprocess_and_ocr_page
|
| 2099 |
-
# will automatically use Tesseract OCR fallback as intended.
|
| 2100 |
page_data, _ = preprocess_and_ocr_page(
|
| 2101 |
img_np, yolo_model, input_path, 0, page, pdf_name
|
| 2102 |
)
|
|
@@ -2128,23 +2136,21 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
|
| 2128 |
for p_data in all_pages_data:
|
| 2129 |
sequential_blocks.extend(p_data.get('blocks', []))
|
| 2130 |
|
| 2131 |
-
# --- 4. STARTING LAYOUTLMV3 INFERENCE
|
| 2132 |
print("\n" + "=" * 80)
|
| 2133 |
print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
|
| 2134 |
print("=" * 80)
|
| 2135 |
|
| 2136 |
-
# (Inlining your existing LayoutLMv3 inference logic)
|
| 2137 |
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
|
| 2138 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 2139 |
|
| 2140 |
-
#
|
| 2141 |
model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
|
| 2142 |
checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
|
| 2143 |
model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
|
| 2144 |
model.to(device)
|
| 2145 |
model.eval()
|
| 2146 |
|
| 2147 |
-
# Run inference on sequential_blocks...
|
| 2148 |
final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
|
| 2149 |
|
| 2150 |
# 5. POST-PROCESS CLASSIFICATION
|
|
@@ -2156,6 +2162,9 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
|
| 2156 |
return final_result
|
| 2157 |
|
| 2158 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
| 2159 |
print(f"❌ FATAL ERROR in pipeline: {e}")
|
| 2160 |
return None
|
| 2161 |
|
|
|
|
| 2058 |
# ============================================================================
|
| 2059 |
|
| 2060 |
|
|
|
|
| 2061 |
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
    """
    Wrap an image file in a temporary one-page PyMuPDF document.

    Goes PIL -> in-memory PDF bytes -> fitz, which bypasses PyMuPDF's
    'encoder pdf not available' errors seen when converting images
    directly.

    Args:
        image_path: Path to the image file (jpg/png/bmp/etc.).

    Returns:
        Tuple of (document, first page). The caller is responsible for
        closing the returned document.
    """
    # 1. Open via PIL and force RGB so the PDF encoder never sees alpha
    # or palette modes. The context manager closes the underlying file
    # handle — Image.open() is lazy and would otherwise keep the file
    # open until garbage collection (resource leak in the original).
    with Image.open(image_path) as src:
        img = src.convert("RGB")

    # 2. Serialize the image as a single-page PDF entirely in memory.
    # getvalue() replaces the original seek(0)/read() dance, and the
    # context manager releases the buffer deterministically.
    with io.BytesIO() as pdf_stream:
        img.save(pdf_stream, format="PDF")
        pdf_bytes = pdf_stream.getvalue()

    # 3. Let PyMuPDF parse the in-memory PDF bytes.
    doc = fitz.open("pdf", pdf_bytes)
    return doc, doc[0]
|
| 2077 |
|
| 2078 |
+
|
| 2079 |
+
|
| 2080 |
+
|
| 2081 |
+
|
| 2082 |
def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
|
| 2083 |
"""
|
| 2084 |
Modified pipeline that handles both PDFs and Images, running YOLO,
|
|
|
|
| 2087 |
# 1. INITIALIZE YOLO
|
| 2088 |
yolo_model = YOLO(WEIGHTS_PATH)
|
| 2089 |
|
| 2090 |
+
# 2. DETECT FILE TYPE
|
| 2091 |
+
# FIX: [1] added to get the extension string from the (root, ext) tuple
|
| 2092 |
+
ext = os.path.splitext(input_path)[1].lower()
|
| 2093 |
is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
|
| 2094 |
|
| 2095 |
all_pages_data = []
|
|
|
|
| 2098 |
try:
|
| 2099 |
if is_image:
|
| 2100 |
print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
|
| 2101 |
+
# Use the corrected helper function defined above
|
| 2102 |
doc, page = load_image_as_fitz_page(input_path)
|
| 2103 |
|
| 2104 |
+
# Render for YOLO
|
| 2105 |
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 2106 |
img_np = pixmap_to_numpy(pix)
|
| 2107 |
|
|
|
|
|
|
|
| 2108 |
page_data, _ = preprocess_and_ocr_page(
|
| 2109 |
img_np, yolo_model, input_path, 0, page, pdf_name
|
| 2110 |
)
|
|
|
|
| 2136 |
for p_data in all_pages_data:
|
| 2137 |
sequential_blocks.extend(p_data.get('blocks', []))
|
| 2138 |
|
| 2139 |
+
# --- 4. STARTING LAYOUTLMV3 INFERENCE ---
|
| 2140 |
print("\n" + "=" * 80)
|
| 2141 |
print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
|
| 2142 |
print("=" * 80)
|
| 2143 |
|
|
|
|
| 2144 |
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
|
| 2145 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 2146 |
|
| 2147 |
+
# Note: Ensure LayoutLMv3ForTokenClassification is defined in your script
|
| 2148 |
model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
|
| 2149 |
checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
|
| 2150 |
model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
|
| 2151 |
model.to(device)
|
| 2152 |
model.eval()
|
| 2153 |
|
|
|
|
| 2154 |
final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
|
| 2155 |
|
| 2156 |
# 5. POST-PROCESS CLASSIFICATION
|
|
|
|
| 2162 |
return final_result
|
| 2163 |
|
| 2164 |
except Exception as e:
|
| 2165 |
+
# Improved error logging to catch exactly where it fails
|
| 2166 |
+
import traceback
|
| 2167 |
+
traceback.print_exc()
|
| 2168 |
print(f"❌ FATAL ERROR in pipeline: {e}")
|
| 2169 |
return None
|
| 2170 |
|