heerjtdev committed on
Commit
b5a5969
·
verified ·
1 Parent(s): becd980

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +142 -17
working_yolo_pipeline.py CHANGED
@@ -2420,6 +2420,137 @@ import time
2420
  import traceback
2421
  import glob
2422
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2423
  def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2424
  if not os.path.exists(input_pdf_path):
2425
  print(f"❌ ERROR: File not found: {input_pdf_path}")
@@ -2438,14 +2569,10 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2438
  preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2439
  raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2440
 
2441
-
2442
- # If the user didn't provide a path, create one in the temp directory
2443
  if structured_intermediate_output_path is None:
2444
  structured_intermediate_output_path = os.path.join(
2445
  temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
2446
  )
2447
-
2448
-
2449
 
2450
  final_result = None
2451
  try:
@@ -2468,7 +2595,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2468
  print("❌ FAILED at Step 2: Inference returned no data.")
2469
  return None
2470
 
2471
- # Save raw predictions for Step 3
2472
  with open(raw_output_path, 'w', encoding='utf-8') as f:
2473
  json.dump(page_raw_predictions_list, f, indent=4)
2474
  print(f"βœ… Step 2 Complete ({time.time() - p2_start:.2f}s)")
@@ -2483,7 +2609,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2483
  print("❌ FAILED at Step 3: BIO conversion failed.")
2484
  return None
2485
 
2486
- # Logic adjustments
2487
  print("... Correcting misalignments and linking context ...")
2488
  structured_data_list = correct_misaligned_options(structured_data_list)
2489
  structured_data_list = process_context_linking(structured_data_list)
@@ -2498,20 +2623,11 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2498
  return None
2499
  print(f"βœ… Step 4 Complete ({time.time() - p4_start:.2f}s)")
2500
 
2501
-
2502
-
2503
- # --- ADD THIS NEW STEP HERE ---
2504
  print(f"\n[Step 4.5/5] Adding Question Type Classification...")
2505
  p4_5_start = time.time()
2506
  final_result = add_question_type_validation(final_result)
2507
  print(f"βœ… Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
2508
-
2509
-
2510
-
2511
- # --- END OF NEW STEP ---
2512
-
2513
-
2514
-
2515
 
2516
  # --- Phase 5: Hierarchical Tagging ---
2517
  print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
@@ -2523,6 +2639,16 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2523
  else:
2524
  print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
2525
 
 
 
 
 
 
 
 
 
 
 
2526
  except Exception as e:
2527
  print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
2528
  print(f"Error Message: {str(e)}")
@@ -2550,7 +2676,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2550
 
2551
 
2552
 
2553
-
2554
  if __name__ == "__main__":
2555
  parser = argparse.ArgumentParser(description="Complete Pipeline")
2556
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
 
2420
  import traceback
2421
  import glob
2422
 
2423
+ # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2424
+ # if not os.path.exists(input_pdf_path):
2425
+ # print(f"❌ ERROR: File not found: {input_pdf_path}")
2426
+ # return None
2427
+
2428
+ # print("\n" + "#" * 80)
2429
+ # print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2430
+ # print(f"Input: {input_pdf_path}")
2431
+ # print("#" * 80)
2432
+
2433
+ # overall_start = time.time()
2434
+ # pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2435
+ # temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2436
+ # os.makedirs(temp_pipeline_dir, exist_ok=True)
2437
+
2438
+ # preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2439
+ # raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2440
+
2441
+
2442
+ # # If the user didn't provide a path, create one in the temp directory
2443
+ # if structured_intermediate_output_path is None:
2444
+ # structured_intermediate_output_path = os.path.join(
2445
+ # temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
2446
+ # )
2447
+
2448
+
2449
+
2450
+ # final_result = None
2451
+ # try:
2452
+ # # --- Phase 1: Preprocessing ---
2453
+ # print(f"\n[Step 1/5] Preprocessing (YOLO + Masking)...")
2454
+ # p1_start = time.time()
2455
+ # preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2456
+ # if not preprocessed_json_path_out:
2457
+ # print("❌ FAILED at Step 1: Preprocessing returned None.")
2458
+ # return None
2459
+ # print(f"βœ… Step 1 Complete ({time.time() - p1_start:.2f}s)")
2460
+
2461
+ # # --- Phase 2: Inference ---
2462
+ # print(f"\n[Step 2/5] Inference (LayoutLMv3)...")
2463
+ # p2_start = time.time()
2464
+ # page_raw_predictions_list = run_inference_and_get_raw_words(
2465
+ # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2466
+ # )
2467
+ # if not page_raw_predictions_list:
2468
+ # print("❌ FAILED at Step 2: Inference returned no data.")
2469
+ # return None
2470
+
2471
+ # # Save raw predictions for Step 3
2472
+ # with open(raw_output_path, 'w', encoding='utf-8') as f:
2473
+ # json.dump(page_raw_predictions_list, f, indent=4)
2474
+ # print(f"βœ… Step 2 Complete ({time.time() - p2_start:.2f}s)")
2475
+
2476
+ # # --- Phase 3: Decoding ---
2477
+ # print(f"\n[Step 3/5] Decoding (BIO to Structured JSON)...")
2478
+ # p3_start = time.time()
2479
+ # structured_data_list = convert_bio_to_structured_json_relaxed(
2480
+ # raw_output_path, structured_intermediate_output_path
2481
+ # )
2482
+ # if not structured_data_list:
2483
+ # print("❌ FAILED at Step 3: BIO conversion failed.")
2484
+ # return None
2485
+
2486
+ # # Logic adjustments
2487
+ # print("... Correcting misalignments and linking context ...")
2488
+ # structured_data_list = correct_misaligned_options(structured_data_list)
2489
+ # structured_data_list = process_context_linking(structured_data_list)
2490
+ # print(f"βœ… Step 3 Complete ({time.time() - p3_start:.2f}s)")
2491
+
2492
+ # # --- Phase 4: Base64 & LaTeX ---
2493
+ # print(f"\n[Step 4/5] Finalizing Layout (Base64 Images & LaTeX)...")
2494
+ # p4_start = time.time()
2495
+ # final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2496
+ # if not final_result:
2497
+ # print("❌ FAILED at Step 4: Final formatting failed.")
2498
+ # return None
2499
+ # print(f"βœ… Step 4 Complete ({time.time() - p4_start:.2f}s)")
2500
+
2501
+
2502
+
2503
+ # # --- ADD THIS NEW STEP HERE ---
2504
+ # print(f"\n[Step 4.5/5] Adding Question Type Classification...")
2505
+ # p4_5_start = time.time()
2506
+ # final_result = add_question_type_validation(final_result)
2507
+ # print(f"βœ… Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
2508
+
2509
+
2510
+
2511
+ # # --- END OF NEW STEP ---
2512
+
2513
+
2514
+
2515
+
2516
+ # # --- Phase 5: Hierarchical Tagging ---
2517
+ # print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
2518
+ # p5_start = time.time()
2519
+ # classifier = HierarchicalClassifier()
2520
+ # if classifier.load_models():
2521
+ # final_result = post_process_json_with_inference(final_result, classifier)
2522
+ # print(f"βœ… Step 5 Complete: Tags added ({time.time() - p5_start:.2f}s)")
2523
+ # else:
2524
+ # print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
2525
+
2526
+ # except Exception as e:
2527
+ # print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
2528
+ # print(f"Error Message: {str(e)}")
2529
+ # traceback.print_exc()
2530
+ # return None
2531
+
2532
+ # finally:
2533
+ # print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
2534
+ # try:
2535
+ # for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
2536
+ # os.remove(f)
2537
+ # os.rmdir(temp_pipeline_dir)
2538
+ # print("🧹 Cleanup successful.")
2539
+ # except Exception as e:
2540
+ # print(f"⚠️ Cleanup failed: {e}")
2541
+
2542
+ # total_time = time.time() - overall_start
2543
+ # print("\n" + "#" * 80)
2544
+ # print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
2545
+ # print("#" * 80)
2546
+
2547
+ # return final_result
2548
+
2549
+
2550
+
2551
+
2552
+
2553
+
2554
  def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2555
  if not os.path.exists(input_pdf_path):
2556
  print(f"❌ ERROR: File not found: {input_pdf_path}")
 
2569
  preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2570
  raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2571
 
 
 
2572
  if structured_intermediate_output_path is None:
2573
  structured_intermediate_output_path = os.path.join(
2574
  temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
2575
  )
 
 
2576
 
2577
  final_result = None
2578
  try:
 
2595
  print("❌ FAILED at Step 2: Inference returned no data.")
2596
  return None
2597
 
 
2598
  with open(raw_output_path, 'w', encoding='utf-8') as f:
2599
  json.dump(page_raw_predictions_list, f, indent=4)
2600
  print(f"βœ… Step 2 Complete ({time.time() - p2_start:.2f}s)")
 
2609
  print("❌ FAILED at Step 3: BIO conversion failed.")
2610
  return None
2611
 
 
2612
  print("... Correcting misalignments and linking context ...")
2613
  structured_data_list = correct_misaligned_options(structured_data_list)
2614
  structured_data_list = process_context_linking(structured_data_list)
 
2623
  return None
2624
  print(f"βœ… Step 4 Complete ({time.time() - p4_start:.2f}s)")
2625
 
2626
+ # --- Phase 4.5: Question Type Classification ---
 
 
2627
  print(f"\n[Step 4.5/5] Adding Question Type Classification...")
2628
  p4_5_start = time.time()
2629
  final_result = add_question_type_validation(final_result)
2630
  print(f"βœ… Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
 
 
 
 
 
 
 
2631
 
2632
  # --- Phase 5: Hierarchical Tagging ---
2633
  print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
 
2639
  else:
2640
  print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
2641
 
2642
+ # ============================================================
2643
+ # πŸ”§ NEW STEP: FILTER OUT METADATA ENTRIES
2644
+ # ============================================================
2645
+ print(f"\n[Post-Processing] Removing METADATA entries...")
2646
+ initial_count = len(final_result)
2647
+ final_result = [item for item in final_result if item.get('type') != 'METADATA']
2648
+ removed_count = initial_count - len(final_result)
2649
+ print(f"βœ… Removed {removed_count} METADATA entries. {len(final_result)} questions remain.")
2650
+ # ============================================================
2651
+
2652
  except Exception as e:
2653
  print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
2654
  print(f"Error Message: {str(e)}")
 
2676
 
2677
 
2678
 
 
2679
  if __name__ == "__main__":
2680
  parser = argparse.ArgumentParser(description="Complete Pipeline")
2681
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")