heerjtdev committed on
Commit
184edf5
·
verified ·
1 Parent(s): c3f83a2

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +52 -6
working_yolo_pipeline.py CHANGED
@@ -2476,6 +2476,44 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
2476
 
2477
 
2478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2479
  if __name__ == "__main__":
2480
  parser = argparse.ArgumentParser(description="Complete Pipeline")
2481
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
@@ -2490,7 +2528,7 @@ if __name__ == "__main__":
2490
  pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
2491
  final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
2492
 
2493
- # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
2494
  # raw_predictions_output_path = os.path.abspath(
2495
  # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
2496
  # ---------------------------------------------
@@ -2501,13 +2539,21 @@ if __name__ == "__main__":
2501
  args.layoutlmv3_model_path )
2502
  # -----------------------------
2503
 
 
2504
  if final_json_data:
 
 
 
 
 
 
 
 
 
2505
  with open(final_output_path, 'w', encoding='utf-8') as f:
2506
- json.dump(final_json_data, f, indent=2, ensure_ascii=False)
 
2507
  print(f"\n✅ Final Data Saved: {final_output_path}")
2508
  else:
2509
  print("\n❌ Pipeline Failed.")
2510
- sys.exit(1)
2511
-
2512
-
2513
-
 
2476
 
2477
 
2478
 
2479
+ # if __name__ == "__main__":
2480
+ # parser = argparse.ArgumentParser(description="Complete Pipeline")
2481
+ # parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
2482
+ # parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
2483
+
2484
+ # # --- ADDED ARGUMENT FOR DEBUGGING ---
2485
+ # parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
2486
+ # help="Debug path for raw BIO tag predictions (JSON).")
2487
+ # # ------------------------------------
2488
+ # args = parser.parse_args()
2489
+
2490
+ # pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
2491
+ # final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
2492
+
2493
+ # # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
2494
+ # # raw_predictions_output_path = os.path.abspath(
2495
+ # # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
2496
+ # # ---------------------------------------------
2497
+
2498
+ # # --- UPDATED FUNCTION CALL ---
2499
+ # final_json_data = run_document_pipeline(
2500
+ # args.input_pdf,
2501
+ # args.layoutlmv3_model_path )
2502
+ # # -----------------------------
2503
+
2504
+
2505
+
2506
+ # if final_json_data:
2507
+ # with open(final_output_path, 'w', encoding='utf-8') as f:
2508
+ # json.dump(final_json_data, f, indent=2, ensure_ascii=False)
2509
+ # print(f"\n✅ Final Data Saved: {final_output_path}")
2510
+ # else:
2511
+ # print("\n❌ Pipeline Failed.")
2512
+ # sys.exit(1)
2513
+
2514
+
2515
+
2516
+
2517
  if __name__ == "__main__":
2518
  parser = argparse.ArgumentParser(description="Complete Pipeline")
2519
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
 
2528
  pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
2529
  final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
2530
 
2531
+ # --- CALCULATE RAW PREDICTIONS OUTPUT PATH (Kept commented as per original script) ---
2532
  # raw_predictions_output_path = os.path.abspath(
2533
  # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
2534
  # ---------------------------------------------
 
2539
  args.layoutlmv3_model_path )
2540
  # -----------------------------
2541
 
2542
+ # 🛑 CRITICAL FINAL FIX: CUSTOM JSON SAVING TO REMOVE ESCAPING 🛑
2543
  if final_json_data:
2544
+ # 1. Dump the Python object to a standard JSON string.
2545
+ # This converts the in-memory single backslash ('\') into the JSON escaped double backslash ('\\').
2546
+ json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
2547
+
2548
+ # 2. **UNDO ESCAPING:** The string currently contains '\\' where you need '\' for LaTeX.
2549
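+ # We replace the JSON-escaped double backslash ('\\') with a single literal backslash ('\'). NOTE(review): after this replacement the file is no longer strict JSON — a lone '\' followed by a letter is either an invalid escape (e.g. '\a' in '\alpha') or a different one (e.g. '\f' in '\frac' reads back as a form feed), so json.load() on this file may fail or alter the text.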
2550
+ final_output_content = json_str.replace('\\\\', '\\')
2551
+
2552
+ # 3. Write the corrected string content to the file.
2553
  with open(final_output_path, 'w', encoding='utf-8') as f:
2554
+ f.write(final_output_content)
2555
+
2556
  print(f"\n✅ Final Data Saved: {final_output_path}")
2557
  else:
2558
  print("\n❌ Pipeline Failed.")
2559
+ sys.exit(1)