heerjtdev committed on
Commit
12283fc
·
verified ·
1 Parent(s): 1a1c9b7

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +3 -83
working_yolo_pipeline.py CHANGED
@@ -2047,8 +2047,9 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2047
 
2048
 
2049
 
2050
- def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2051
- List[Dict[str, Any]]]:
 
2052
  if not os.path.exists(input_pdf_path): return None
2053
 
2054
  print("\n" + "#" * 80)
@@ -2118,29 +2119,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Op
2118
  return final_result
2119
 
2120
 
2121
-
2122
-
2123
- # if __name__ == "__main__":
2124
- # parser = argparse.ArgumentParser(description="Complete Pipeline")
2125
- # parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
2126
- # parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
2127
- # parser.add_argument("--ls_output_path", type=str, default=None, help="Label Studio Output Path")
2128
- # args = parser.parse_args()
2129
-
2130
- # pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
2131
- # final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
2132
- # ls_output_path = os.path.abspath(
2133
- # args.ls_output_path if args.ls_output_path else f"{pdf_name}_label_studio_tasks.json")
2134
-
2135
- # final_json_data = run_document_pipeline(args.input_pdf, args.layoutlmv3_model_path, ls_output_path)
2136
-
2137
- # if final_json_data:
2138
- # with open(final_output_path, 'w', encoding='utf-8') as f:
2139
- # json.dump(final_json_data, f, indent=2, ensure_ascii=False)
2140
- # print(f"\n✅ Final Data Saved: {final_output_path}")
2141
- # else:
2142
- # print("\n❌ Pipeline Failed.")
2143
- # sys.exit(1)
2144
 
2145
 
2146
 
@@ -2180,61 +2158,3 @@ if __name__ == "__main__":
2180
 
2181
 
2182
 
2183
-
2184
-
2185
-
2186
- # if __name__ == "__main__":
2187
- # # Ensure 'json', 'argparse', 'os', and 'sys' are imported at the top of your script
2188
- # # import json
2189
- # # import argparse
2190
- # # import os
2191
- # # import sys
2192
- #
2193
- # parser = argparse.ArgumentParser(description="Complete Pipeline")
2194
- # parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
2195
- # parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
2196
- # parser.add_argument("--ls_output_path", type=str, default=None, help="Label Studio Output Path")
2197
- # # --- ADDED ARGUMENT FOR DEBUGGING ---
2198
- # parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
2199
- # help="Debug path for raw BIO tag predictions (JSON).")
2200
- # # ------------------------------------
2201
- # args = parser.parse_args()
2202
- #
2203
- # pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
2204
- # final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
2205
- # ls_output_path = os.path.abspath(
2206
- # args.ls_output_path if args.ls_output_path else f"{pdf_name}_label_studio_tasks.json")
2207
- # # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
2208
- # # raw_predictions_output_path = os.path.abspath(
2209
- # # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
2210
- # # ---------------------------------------------
2211
- #
2212
- # # --- UPDATED FUNCTION CALL ---
2213
- # final_json_data = run_document_pipeline(
2214
- # args.input_pdf,
2215
- # args.layoutlmv3_model_path,
2216
- # ls_output_path,
2217
- # # raw_predictions_output_path # Pass the new argument
2218
- # )
2219
- # # -----------------------------
2220
- #
2221
- # # 🛑 CRITICAL FIX: CUSTOM JSON SAVING TO REMOVE DOUBLE BACKSLASHES 🛑
2222
- # if final_json_data:
2223
- # # 1. Dump the Python object to a standard JSON string.
2224
- # # This uses json.dumps which correctly escapes single backslashes ('\') to ('\\').
2225
- # json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
2226
- #
2227
- # # 2. **UNDO ESCAPING:** Replace every instance of the JSON-escaped backslash ('\\')
2228
- # # with a single literal backslash ('\'). This forces the file content to be correct for LaTeX.
2229
- # final_output_content = json_str.replace('\\\\', '\\')
2230
- #
2231
- # # 3. Write the corrected string content to the file.
2232
- # with open(final_output_path, 'w', encoding='utf-8') as f:
2233
- # f.write(final_output_content)
2234
- #
2235
- # print(f"\n✅ Final Data Saved: {final_output_path}")
2236
- # else:
2237
- # print("\n❌ Pipeline Failed.")
2238
- # sys.exit(1)
2239
-
2240
-
 
2047
 
2048
 
2049
 
2050
+ # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2051
+ # List[Dict[str, Any]]]:
2052
+ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2053
  if not os.path.exists(input_pdf_path): return None
2054
 
2055
  print("\n" + "#" * 80)
 
2119
  return final_result
2120
 
2121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2122
 
2123
 
2124
 
 
2158
 
2159
 
2160