heerjtdev committed on
Commit
84f68b6
·
verified ·
1 Parent(s): 389d827

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +8 -8
working_yolo_pipeline.py CHANGED
@@ -1223,7 +1223,7 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1223
  return []
1224
 
1225
  try:
1226
- with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
1227
  preprocessed_data = json.load(f)
1228
  print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
1229
  except Exception:
@@ -1288,7 +1288,7 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1288
 
1289
  # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
1290
  chunk_words = [
1291
- str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
1292
  for t in chunk_tokens
1293
  ]
1294
  chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
@@ -1520,7 +1520,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1520
  print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
1521
  print("=" * 80)
1522
  try:
1523
- with open(input_path, 'r', encoding='utf-8') as f:
1524
  predictions_by_page = json.load(f)
1525
  except Exception as e:
1526
  print(f"❌ Error loading raw prediction file: {e}")
@@ -1666,7 +1666,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1666
  item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
1667
 
1668
  try:
1669
- with open(output_path, 'w', encoding='utf-8') as f:
1670
  json.dump(structured_data, f, indent=2, ensure_ascii=False)
1671
  except Exception:
1672
  pass
@@ -1784,7 +1784,7 @@ def process_context_linking(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1784
 
1785
  # 2. Safely clean the string by encoding to UTF-8 and ignoring errors,
1786
  # then decoding back. This removes the invalid surrogate character.
1787
- q_preview = q_preview_raw.encode('utf-8', errors='ignore').decode('utf-8')
1788
  # ------------------------------------------------------------------
1789
 
1790
  # RESOLUTION LOGIC
@@ -1894,7 +1894,7 @@ def get_base64_for_file(filepath: str) -> Optional[str]:
1894
  try:
1895
  with open(filepath, "rb") as image_file:
1896
  # Return raw base64 string
1897
- return base64.b64encode(image_file.read()).decode('utf-8')
1898
  except Exception as e:
1899
  print(f"Error reading and encoding file {filepath}: {e}")
1900
  return None
@@ -2028,7 +2028,7 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
2028
 
2029
  # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2030
  # Save raw predictions to the temporary file
2031
- with open(raw_output_path, 'w', encoding='utf-8') as f:
2032
  json.dump(page_raw_predictions_list, f, indent=4)
2033
 
2034
  # Explicitly copy/save the raw predictions to the user-specified debug path
@@ -2137,7 +2137,7 @@ if __name__ == "__main__":
2137
  # final_output_content = json_str.replace('\\\\\\\\', '\\\\')
2138
 
2139
  # 3. Write the corrected string content to the file.
2140
- with open(final_output_path, 'w', encoding='utf-8') as f:
2141
  f.write(final_output_content)
2142
 
2143
  print(f"\n✅ Final Data Saved: {final_output_path}")
 
1223
  return []
1224
 
1225
  try:
1226
+ with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
1227
  preprocessed_data = json.load(f)
1228
  print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
1229
  except Exception:
 
1288
 
1289
  # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
1290
  chunk_words = [
1291
+ str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
1292
  for t in chunk_tokens
1293
  ]
1294
  chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
 
1520
  print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
1521
  print("=" * 80)
1522
  try:
1523
+ with open(input_path, 'r', encoding='utf-8') as f:
1524
  predictions_by_page = json.load(f)
1525
  except Exception as e:
1526
  print(f"❌ Error loading raw prediction file: {e}")
 
1666
  item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
1667
 
1668
  try:
1669
+ with open(output_path, 'w', encoding='utf-8') as f:
1670
  json.dump(structured_data, f, indent=2, ensure_ascii=False)
1671
  except Exception:
1672
  pass
 
1784
 
1785
  # 2. Safely clean the string by encoding to UTF-8 and ignoring errors,
1786
  # then decoding back. This removes the invalid surrogate character.
1787
+ q_preview = q_preview_raw.encode('utf-8', errors='ignore').decode('utf-8')
1788
  # ------------------------------------------------------------------
1789
 
1790
  # RESOLUTION LOGIC
 
1894
  try:
1895
  with open(filepath, "rb") as image_file:
1896
  # Return raw base64 string
1897
+ return base64.b64encode(image_file.read()).decode('utf-8')
1898
  except Exception as e:
1899
  print(f"Error reading and encoding file {filepath}: {e}")
1900
  return None
 
2028
 
2029
  # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2030
  # Save raw predictions to the temporary file
2031
+ with open(raw_output_path, 'w', encoding='utf-8') as f:
2032
  json.dump(page_raw_predictions_list, f, indent=4)
2033
 
2034
  # Explicitly copy/save the raw predictions to the user-specified debug path
 
2137
  # final_output_content = json_str.replace('\\\\\\\\', '\\\\')
2138
 
2139
  # 3. Write the corrected string content to the file.
2140
+ with open(final_output_path, 'w', encoding='utf-8') as f:
2141
  f.write(final_output_content)
2142
 
2143
  print(f"\n✅ Final Data Saved: {final_output_path}")