Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +3 -83
working_yolo_pipeline.py
CHANGED
|
@@ -2047,8 +2047,9 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
|
|
| 2047 |
|
| 2048 |
|
| 2049 |
|
| 2050 |
-
def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
|
| 2051 |
-
|
|
|
|
| 2052 |
if not os.path.exists(input_pdf_path): return None
|
| 2053 |
|
| 2054 |
print("\n" + "#" * 80)
|
|
@@ -2118,29 +2119,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Op
|
|
| 2118 |
return final_result
|
| 2119 |
|
| 2120 |
|
| 2121 |
-
|
| 2122 |
-
|
| 2123 |
-
# if __name__ == "__main__":
|
| 2124 |
-
# parser = argparse.ArgumentParser(description="Complete Pipeline")
|
| 2125 |
-
# parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
|
| 2126 |
-
# parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
|
| 2127 |
-
# parser.add_argument("--ls_output_path", type=str, default=None, help="Label Studio Output Path")
|
| 2128 |
-
# args = parser.parse_args()
|
| 2129 |
-
|
| 2130 |
-
# pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
|
| 2131 |
-
# final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
|
| 2132 |
-
# ls_output_path = os.path.abspath(
|
| 2133 |
-
# args.ls_output_path if args.ls_output_path else f"{pdf_name}_label_studio_tasks.json")
|
| 2134 |
-
|
| 2135 |
-
# final_json_data = run_document_pipeline(args.input_pdf, args.layoutlmv3_model_path, ls_output_path)
|
| 2136 |
-
|
| 2137 |
-
# if final_json_data:
|
| 2138 |
-
# with open(final_output_path, 'w', encoding='utf-8') as f:
|
| 2139 |
-
# json.dump(final_json_data, f, indent=2, ensure_ascii=False)
|
| 2140 |
-
# print(f"\n✅ Final Data Saved: {final_output_path}")
|
| 2141 |
-
# else:
|
| 2142 |
-
# print("\n❌ Pipeline Failed.")
|
| 2143 |
-
# sys.exit(1)
|
| 2144 |
|
| 2145 |
|
| 2146 |
|
|
@@ -2180,61 +2158,3 @@ if __name__ == "__main__":
|
|
| 2180 |
|
| 2181 |
|
| 2182 |
|
| 2183 |
-
|
| 2184 |
-
|
| 2185 |
-
|
| 2186 |
-
# if __name__ == "__main__":
|
| 2187 |
-
# # Ensure 'json', 'argparse', 'os', and 'sys' are imported at the top of your script
|
| 2188 |
-
# # import json
|
| 2189 |
-
# # import argparse
|
| 2190 |
-
# # import os
|
| 2191 |
-
# # import sys
|
| 2192 |
-
#
|
| 2193 |
-
# parser = argparse.ArgumentParser(description="Complete Pipeline")
|
| 2194 |
-
# parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
|
| 2195 |
-
# parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
|
| 2196 |
-
# parser.add_argument("--ls_output_path", type=str, default=None, help="Label Studio Output Path")
|
| 2197 |
-
# # --- ADDED ARGUMENT FOR DEBUGGING ---
|
| 2198 |
-
# parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
|
| 2199 |
-
# help="Debug path for raw BIO tag predictions (JSON).")
|
| 2200 |
-
# # ------------------------------------
|
| 2201 |
-
# args = parser.parse_args()
|
| 2202 |
-
#
|
| 2203 |
-
# pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
|
| 2204 |
-
# final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
|
| 2205 |
-
# ls_output_path = os.path.abspath(
|
| 2206 |
-
# args.ls_output_path if args.ls_output_path else f"{pdf_name}_label_studio_tasks.json")
|
| 2207 |
-
# # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
|
| 2208 |
-
# # raw_predictions_output_path = os.path.abspath(
|
| 2209 |
-
# # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
|
| 2210 |
-
# # ---------------------------------------------
|
| 2211 |
-
#
|
| 2212 |
-
# # --- UPDATED FUNCTION CALL ---
|
| 2213 |
-
# final_json_data = run_document_pipeline(
|
| 2214 |
-
# args.input_pdf,
|
| 2215 |
-
# args.layoutlmv3_model_path,
|
| 2216 |
-
# ls_output_path,
|
| 2217 |
-
# # raw_predictions_output_path # Pass the new argument
|
| 2218 |
-
# )
|
| 2219 |
-
# # -----------------------------
|
| 2220 |
-
#
|
| 2221 |
-
# # 🛑 CRITICAL FIX: CUSTOM JSON SAVING TO REMOVE DOUBLE BACKSLASHES 🛑
|
| 2222 |
-
# if final_json_data:
|
| 2223 |
-
# # 1. Dump the Python object to a standard JSON string.
|
| 2224 |
-
# # This uses json.dumps which correctly escapes single backslashes ('\') to ('\\').
|
| 2225 |
-
# json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
|
| 2226 |
-
#
|
| 2227 |
-
# # 2. **UNDO ESCAPING:** Replace every instance of the JSON-escaped backslash ('\\')
|
| 2228 |
-
# # with a single literal backslash ('\'). This forces the file content to be correct for LaTeX.
|
| 2229 |
-
# final_output_content = json_str.replace('\\\\', '\\')
|
| 2230 |
-
#
|
| 2231 |
-
# # 3. Write the corrected string content to the file.
|
| 2232 |
-
# with open(final_output_path, 'w', encoding='utf-8') as f:
|
| 2233 |
-
# f.write(final_output_content)
|
| 2234 |
-
#
|
| 2235 |
-
# print(f"\n✅ Final Data Saved: {final_output_path}")
|
| 2236 |
-
# else:
|
| 2237 |
-
# print("\n❌ Pipeline Failed.")
|
| 2238 |
-
# sys.exit(1)
|
| 2239 |
-
|
| 2240 |
-
|
|
|
|
| 2047 |
|
| 2048 |
|
| 2049 |
|
| 2050 |
+
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
|
| 2051 |
+
# List[Dict[str, Any]]]:
|
| 2052 |
+
def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2053 |
if not os.path.exists(input_pdf_path): return None
|
| 2054 |
|
| 2055 |
print("\n" + "#" * 80)
|
|
|
|
| 2119 |
return final_result
|
| 2120 |
|
| 2121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2122 |
|
| 2123 |
|
| 2124 |
|
|
|
|
| 2158 |
|
| 2159 |
|
| 2160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|