Spaces:
Sleeping
Sleeping
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +54 -5
working_yolo_pipeline.py
CHANGED
|
@@ -2514,6 +2514,53 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
|
|
| 2514 |
|
| 2515 |
|
| 2516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2517 |
if __name__ == "__main__":
|
| 2518 |
parser = argparse.ArgumentParser(description="Complete Pipeline")
|
| 2519 |
parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
|
|
@@ -2539,15 +2586,17 @@ if __name__ == "__main__":
|
|
| 2539 |
args.layoutlmv3_model_path )
|
| 2540 |
# -----------------------------
|
| 2541 |
|
| 2542 |
-
# CRITICAL FINAL FIX: CUSTOM JSON SAVING
|
| 2543 |
if final_json_data:
|
| 2544 |
# 1. Dump the Python object to a standard JSON string.
|
| 2545 |
-
# This converts the in-memory
|
|
|
|
| 2546 |
json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
|
| 2547 |
|
| 2548 |
-
# 2. **UNDO ESCAPING:**
|
| 2549 |
-
#
|
| 2550 |
-
|
|
|
|
| 2551 |
|
| 2552 |
# 3. Write the corrected string content to the file.
|
| 2553 |
with open(final_output_path, 'w', encoding='utf-8') as f:
|
|
|
|
| 2514 |
|
| 2515 |
|
| 2516 |
|
| 2517 |
+
# if __name__ == "__main__":
|
| 2518 |
+
# parser = argparse.ArgumentParser(description="Complete Pipeline")
|
| 2519 |
+
# parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
|
| 2520 |
+
# parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
|
| 2521 |
+
|
| 2522 |
+
# # --- ADDED ARGUMENT FOR DEBUGGING ---
|
| 2523 |
+
# parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
|
| 2524 |
+
# help="Debug path for raw BIO tag predictions (JSON).")
|
| 2525 |
+
# # ------------------------------------
|
| 2526 |
+
# args = parser.parse_args()
|
| 2527 |
+
|
| 2528 |
+
# pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
|
| 2529 |
+
# final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
|
| 2530 |
+
|
| 2531 |
+
# # --- CALCULATE RAW PREDICTIONS OUTPUT PATH (Kept commented as per original script) ---
|
| 2532 |
+
# # raw_predictions_output_path = os.path.abspath(
|
| 2533 |
+
# # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
|
| 2534 |
+
# # ---------------------------------------------
|
| 2535 |
+
|
| 2536 |
+
# # --- UPDATED FUNCTION CALL ---
|
| 2537 |
+
# final_json_data = run_document_pipeline(
|
| 2538 |
+
# args.input_pdf,
|
| 2539 |
+
# args.layoutlmv3_model_path )
|
| 2540 |
+
# # -----------------------------
|
| 2541 |
+
|
| 2542 |
+
# # CRITICAL FINAL FIX: CUSTOM JSON SAVING TO REMOVE ESCAPING
|
| 2543 |
+
# if final_json_data:
|
| 2544 |
+
# # 1. Dump the Python object to a standard JSON string.
|
| 2545 |
+
# # This converts the in-memory single backslash ('\') into the JSON escaped double backslash ('\\').
|
| 2546 |
+
# json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
|
| 2547 |
+
|
| 2548 |
+
# # 2. **UNDO ESCAPING:** The string currently contains '\\' where you need '\' for LaTeX.
|
| 2549 |
+
# # We replace the JSON-escaped double backslash ('\\') with a single literal backslash ('\').
|
| 2550 |
+
# final_output_content = json_str.replace('\\\\', '\\')
|
| 2551 |
+
|
| 2552 |
+
# # 3. Write the corrected string content to the file.
|
| 2553 |
+
# with open(final_output_path, 'w', encoding='utf-8') as f:
|
| 2554 |
+
# f.write(final_output_content)
|
| 2555 |
+
|
| 2556 |
+
# print(f"\n✅ Final Data Saved: {final_output_path}")
|
| 2557 |
+
# else:
|
| 2558 |
+
# print("\n❌ Pipeline Failed.")
|
| 2559 |
+
# sys.exit(1)
|
| 2560 |
+
|
| 2561 |
+
|
| 2562 |
+
|
| 2563 |
+
|
| 2564 |
if __name__ == "__main__":
|
| 2565 |
parser = argparse.ArgumentParser(description="Complete Pipeline")
|
| 2566 |
parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
|
|
|
|
| 2586 |
args.layoutlmv3_model_path )
|
| 2587 |
# -----------------------------
|
| 2588 |
|
| 2589 |
+
# CRITICAL FINAL FIX: AGGRESSIVE CUSTOM JSON SAVING
|
| 2590 |
if final_json_data:
|
| 2591 |
# 1. Dump the Python object to a standard JSON string.
|
| 2592 |
+
# This converts the in-memory double backslash ('\\') into a quadruple backslash ('\\\\')
|
| 2593 |
+
# in the raw json_str string content.
|
| 2594 |
json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
|
| 2595 |
|
| 2596 |
+
# 2. **AGGRESSIVE UNDO ESCAPING:** We assume we have quadruple backslashes and
|
| 2597 |
+
# replace them with the double backslashes needed for the LaTeX command to work.
|
| 2598 |
+
# This operation essentially replaces four literal backslashes with two literal backslashes.
|
| 2599 |
+
final_output_content = json_str.replace('\\\\\\\\', '\\\\')
|
| 2600 |
|
| 2601 |
# 3. Write the corrected string content to the file.
|
| 2602 |
with open(final_output_path, 'w', encoding='utf-8') as f:
|