Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +52 -6
working_yolo_pipeline.py
CHANGED
|
@@ -2476,6 +2476,44 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
|
|
| 2476 |
|
| 2477 |
|
| 2478 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2479 |
if __name__ == "__main__":
|
| 2480 |
parser = argparse.ArgumentParser(description="Complete Pipeline")
|
| 2481 |
parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
|
|
@@ -2490,7 +2528,7 @@ if __name__ == "__main__":
|
|
| 2490 |
pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
|
| 2491 |
final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
|
| 2492 |
|
| 2493 |
-
# --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
|
| 2494 |
# raw_predictions_output_path = os.path.abspath(
|
| 2495 |
# args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
|
| 2496 |
# ---------------------------------------------
|
|
@@ -2501,13 +2539,21 @@ if __name__ == "__main__":
|
|
| 2501 |
args.layoutlmv3_model_path )
|
| 2502 |
# -----------------------------
|
| 2503 |
|
|
|
|
| 2504 |
if final_json_data:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2505 |
with open(final_output_path, 'w', encoding='utf-8') as f:
|
| 2506 |
-
|
|
|
|
| 2507 |
print(f"\n✅ Final Data Saved: {final_output_path}")
|
| 2508 |
else:
|
| 2509 |
print("\n❌ Pipeline Failed.")
|
| 2510 |
-
sys.exit(1)
|
| 2511 |
-
|
| 2512 |
-
|
| 2513 |
-
|
|
|
|
| 2476 |
|
| 2477 |
|
| 2478 |
|
| 2479 |
+
# if __name__ == "__main__":
|
| 2480 |
+
# parser = argparse.ArgumentParser(description="Complete Pipeline")
|
| 2481 |
+
# parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
|
| 2482 |
+
# parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
|
| 2483 |
+
|
| 2484 |
+
# # --- ADDED ARGUMENT FOR DEBUGGING ---
|
| 2485 |
+
# parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
|
| 2486 |
+
# help="Debug path for raw BIO tag predictions (JSON).")
|
| 2487 |
+
# # ------------------------------------
|
| 2488 |
+
# args = parser.parse_args()
|
| 2489 |
+
|
| 2490 |
+
# pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
|
| 2491 |
+
# final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
|
| 2492 |
+
|
| 2493 |
+
# # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
|
| 2494 |
+
# # raw_predictions_output_path = os.path.abspath(
|
| 2495 |
+
# # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
|
| 2496 |
+
# # ---------------------------------------------
|
| 2497 |
+
|
| 2498 |
+
# # --- UPDATED FUNCTION CALL ---
|
| 2499 |
+
# final_json_data = run_document_pipeline(
|
| 2500 |
+
# args.input_pdf,
|
| 2501 |
+
# args.layoutlmv3_model_path )
|
| 2502 |
+
# # -----------------------------
|
| 2503 |
+
|
| 2504 |
+
|
| 2505 |
+
|
| 2506 |
+
# if final_json_data:
|
| 2507 |
+
# with open(final_output_path, 'w', encoding='utf-8') as f:
|
| 2508 |
+
# json.dump(final_json_data, f, indent=2, ensure_ascii=False)
|
| 2509 |
+
# print(f"\n✅ Final Data Saved: {final_output_path}")
|
| 2510 |
+
# else:
|
| 2511 |
+
# print("\n❌ Pipeline Failed.")
|
| 2512 |
+
# sys.exit(1)
|
| 2513 |
+
|
| 2514 |
+
|
| 2515 |
+
|
| 2516 |
+
|
| 2517 |
if __name__ == "__main__":
|
| 2518 |
parser = argparse.ArgumentParser(description="Complete Pipeline")
|
| 2519 |
parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
|
|
|
|
| 2528 |
pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
|
| 2529 |
final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
|
| 2530 |
|
| 2531 |
+
# --- CALCULATE RAW PREDICTIONS OUTPUT PATH (Kept commented as per original script) ---
|
| 2532 |
# raw_predictions_output_path = os.path.abspath(
|
| 2533 |
# args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
|
| 2534 |
# ---------------------------------------------
|
|
|
|
| 2539 |
args.layoutlmv3_model_path )
|
| 2540 |
# -----------------------------
|
| 2541 |
|
| 2542 |
+
# 🛑 CRITICAL FINAL FIX: CUSTOM JSON SAVING TO REMOVE ESCAPING 🛑
|
| 2543 |
if final_json_data:
|
| 2544 |
+
# 1. Dump the Python object to a standard JSON string.
|
| 2545 |
+
# This converts the in-memory single backslash ('\') into the JSON escaped double backslash ('\\').
|
| 2546 |
+
json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
|
| 2547 |
+
|
| 2548 |
+
# 2. **UNDO ESCAPING:** The string currently contains '\\' where you need '\' for LaTeX.
|
| 2549 |
+
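# We replace the JSON-escaped double backslash ('\\') with a single literal backslash ('\').
# NOTE: after this replacement the file is no longer guaranteed to be valid JSON: a lone backslash is only legal before ", \, /, b, f, n, r, t or u, so '\alpha' fails to parse and '\frac' is read back as a form feed followed by 'rac'.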
|
| 2550 |
+
final_output_content = json_str.replace('\\\\', '\\')
|
| 2551 |
+
|
| 2552 |
+
# 3. Write the corrected string content to the file.
|
| 2553 |
with open(final_output_path, 'w', encoding='utf-8') as f:
|
| 2554 |
+
f.write(final_output_content)
|
| 2555 |
+
|
| 2556 |
print(f"\n✅ Final Data Saved: {final_output_path}")
|
| 2557 |
else:
|
| 2558 |
print("\n❌ Pipeline Failed.")
|
| 2559 |
+
sys.exit(1)
|
|
|
|
|
|
|
|
|