heerjtdev committed on
Commit
619a08a
·
verified ·
1 Parent(s): 184edf5

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +54 -5
working_yolo_pipeline.py CHANGED
@@ -2514,6 +2514,53 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
2514
 
2515
 
2516
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2517
  if __name__ == "__main__":
2518
  parser = argparse.ArgumentParser(description="Complete Pipeline")
2519
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
@@ -2539,15 +2586,17 @@ if __name__ == "__main__":
2539
  args.layoutlmv3_model_path )
2540
  # -----------------------------
2541
 
2542
- # 🛑 CRITICAL FINAL FIX: CUSTOM JSON SAVING TO REMOVE ESCAPING 🛑
2543
  if final_json_data:
2544
  # 1. Dump the Python object to a standard JSON string.
2545
- # This converts the in-memory single backslash ('\') into the JSON escaped double backslash ('\\').
 
2546
  json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
2547
 
2548
- # 2. **UNDO ESCAPING:** The string currently contains '\\' where you need '\' for LaTeX.
2549
- # We replace the JSON-escaped double backslash ('\\') with a single literal backslash ('\').
2550
- final_output_content = json_str.replace('\\\\', '\\')
 
2551
 
2552
  # 3. Write the corrected string content to the file.
2553
  with open(final_output_path, 'w', encoding='utf-8') as f:
 
2514
 
2515
 
2516
 
2517
+ # if __name__ == "__main__":
2518
+ # parser = argparse.ArgumentParser(description="Complete Pipeline")
2519
+ # parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
2520
+ # parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
2521
+
2522
+ # # --- ADDED ARGUMENT FOR DEBUGGING ---
2523
+ # parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
2524
+ # help="Debug path for raw BIO tag predictions (JSON).")
2525
+ # # ------------------------------------
2526
+ # args = parser.parse_args()
2527
+
2528
+ # pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
2529
+ # final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
2530
+
2531
+ # # --- CALCULATE RAW PREDICTIONS OUTPUT PATH (Kept commented as per original script) ---
2532
+ # # raw_predictions_output_path = os.path.abspath(
2533
+ # # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
2534
+ # # ---------------------------------------------
2535
+
2536
+ # # --- UPDATED FUNCTION CALL ---
2537
+ # final_json_data = run_document_pipeline(
2538
+ # args.input_pdf,
2539
+ # args.layoutlmv3_model_path )
2540
+ # # -----------------------------
2541
+
2542
+ # # 🛑 CRITICAL FINAL FIX: CUSTOM JSON SAVING TO REMOVE ESCAPING 🛑
2543
+ # if final_json_data:
2544
+ # # 1. Dump the Python object to a standard JSON string.
2545
+ # # This converts the in-memory single backslash ('\') into the JSON escaped double backslash ('\\').
2546
+ # json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
2547
+
2548
+ # # 2. **UNDO ESCAPING:** The string currently contains '\\' where you need '\' for LaTeX.
2549
+ # # We replace the JSON-escaped double backslash ('\\') with a single literal backslash ('\').
2550
+ # final_output_content = json_str.replace('\\\\', '\\')
2551
+
2552
+ # # 3. Write the corrected string content to the file.
2553
+ # with open(final_output_path, 'w', encoding='utf-8') as f:
2554
+ # f.write(final_output_content)
2555
+
2556
+ # print(f"\n✅ Final Data Saved: {final_output_path}")
2557
+ # else:
2558
+ # print("\n❌ Pipeline Failed.")
2559
+ # sys.exit(1)
2560
+
2561
+
2562
+
2563
+
2564
  if __name__ == "__main__":
2565
  parser = argparse.ArgumentParser(description="Complete Pipeline")
2566
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
 
2586
  args.layoutlmv3_model_path )
2587
  # -----------------------------
2588
 
2589
+ # 🛑 CRITICAL FINAL FIX: AGGRESSIVE CUSTOM JSON SAVING 🛑
2590
  if final_json_data:
2591
  # 1. Dump the Python object to a standard JSON string.
2592
+ # This converts the in-memory double backslash ('\\') into a quadruple backslash ('\\\\')
2593
+ # in the raw json_str string content.
2594
  json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
2595
 
2596
+ # 2. **AGGRESSIVE UNDO ESCAPING:** We assume we have quadruple backslashes and
2597
+ # replace them with the double backslashes needed for the LaTeX command to work.
2598
+ # This operation essentially replaces four literal backslashes with two literal backslashes.
2599
+ final_output_content = json_str.replace('\\\\\\\\', '\\\\')
2600
 
2601
  # 3. Write the corrected string content to the file.
2602
  with open(final_output_path, 'w', encoding='utf-8') as f: