Update working_yolo_pipeline.py

working_yolo_pipeline.py  CHANGED  (+8 -8)

@@ -1223,7 +1223,7 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
         return []
 
     try:
-        with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
+        with open(preprocessed_json_path, 'r', encoding='utf-16') as f:
             preprocessed_data = json.load(f)
             print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
     except Exception:
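json.load only sees whatever text open() hands it, so the encoding named here has to match the encoding the preprocessed JSON was actually written with. A minimal round-trip sketch; the file name and payload are purely illustrative, and the same pattern holds whether the codec is utf-8 or utf-16:

import json

payload = {"pages": [{"page": 1, "words": ["Résumé", "données"]}]}

# Writer and reader must name the same codec explicitly.
with open("preprocessed_sample.json", "w", encoding="utf-16") as f:
    json.dump(payload, f, indent=2, ensure_ascii=False)

with open("preprocessed_sample.json", "r", encoding="utf-16") as f:
    preprocessed_data = json.load(f)

print(f"Loaded preprocessed data with {len(preprocessed_data['pages'])} pages.")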

@@ -1288,7 +1288,7 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
 
         # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
         chunk_words = [
-            str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
+            str(t['word']).encode('utf-16', errors='ignore').decode('utf-16')
             for t in chunk_tokens
         ]
         chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
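The encode/decode pass with errors='ignore' is the piece doing the clean-up: characters the codec cannot represent, such as unpaired surrogates left over from PDF text extraction, are silently dropped instead of raising. A standalone sketch of the same technique; the token list is made up for illustration, and utf-8 is interchangeable with utf-16 here:

# 'raw_tokens' is illustrative; in the pipeline the values come from chunk_tokens.
raw_tokens = [{'word': 'Invoice'}, {'word': 'total\ud800'}, {'word': 12345}]

cleaned = [
    # errors='ignore' drops the lone surrogate during encoding,
    # so the decode back to str always succeeds.
    str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
    for t in raw_tokens
]

print(cleaned)  # ['Invoice', 'total', '12345']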

@@ -1520,7 +1520,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
     print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
     print("=" * 80)
     try:
-        with open(input_path, 'r', encoding='utf-8') as f:
+        with open(input_path, 'r', encoding='utf-16') as f:
             predictions_by_page = json.load(f)
     except Exception as e:
         print(f"❌ Error loading raw prediction file: {e}")

@@ -1666,7 +1666,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
             item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
 
     try:
-        with open(output_path, 'w', encoding='utf-8') as f:
+        with open(output_path, 'w', encoding='utf-16') as f:
             json.dump(structured_data, f, indent=2, ensure_ascii=False)
     except Exception:
         pass

@@ -1784,7 +1784,7 @@ def process_context_linking(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 
             # 2. Safely clean the string by encoding to UTF-8 and ignoring errors,
             # then decoding back. This removes the invalid surrogate character.
-            q_preview = q_preview_raw.encode('utf-8', errors='ignore').decode('utf-8')
+            q_preview = q_preview_raw.encode('utf-16', errors='ignore').decode('utf-16')
             # ------------------------------------------------------------------
 
             # RESOLUTION LOGIC

@@ -1894,7 +1894,7 @@ def get_base64_for_file(filepath: str) -> Optional[str]:
     try:
         with open(filepath, "rb") as image_file:
             # Return raw base64 string
-            return base64.b64encode(image_file.read()).decode('utf-8')
+            return base64.b64encode(image_file.read()).decode('utf-16')
     except Exception as e:
         print(f"Error reading and encoding file {filepath}: {e}")
         return None
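get_base64_for_file returns the base64 text on success and None on any read or encode failure, so callers only need a None check. A usage sketch; the import path and image file name are assumptions for illustration:

from working_yolo_pipeline import get_base64_for_file  # assumed module name

b64_string = get_base64_for_file('page_1.png')  # illustrative path
if b64_string is not None:
    # e.g. embed the image inline in an HTML report or API payload
    data_uri = f'data:image/png;base64,{b64_string}'
    print(data_uri[:60])
else:
    print('Could not encode image.')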

@@ -2028,7 +2028,7 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
 
         # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
         # Save raw predictions to the temporary file
-        with open(raw_output_path, 'w', encoding='utf-8') as f:
+        with open(raw_output_path, 'w', encoding='utf-16') as f:
             json.dump(page_raw_predictions_list, f, indent=4)
 
         # Explicitly copy/save the raw predictions to the user-specified debug path

@@ -2137,7 +2137,7 @@ if __name__ == "__main__":
     # final_output_content = json_str.replace('\\\\\\\\', '\\\\')
 
     # 3. Write the corrected string content to the file.
-    with open(final_output_path, 'w', encoding='utf-8') as f:
+    with open(final_output_path, 'w', encoding='utf-16') as f:
         f.write(final_output_content)
 
     print(f"\n✅ Final Data Saved: {final_output_path}")
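The final write serializes to a string first so the content can be post-processed (for example the commented-out backslash clean-up above) before it touches the file. A sketch of that three-step flow; structured_data, the clean-up pattern, and the output name are illustrative stand-ins for what the pipeline builds upstream:

import json

structured_data = [{'new_passage': 'Example passage.'}]  # placeholder for the real pipeline output

# 1. Serialize to a string rather than dumping straight to the file.
json_str = json.dumps(structured_data, indent=2, ensure_ascii=False)

# 2. Optional clean-up of over-escaped backslashes; the exact pattern depends on the upstream data.
final_output_content = json_str.replace('\\\\', '\\')

# 3. Write the corrected string content to the file, naming the encoding explicitly.
with open('final_output.json', 'w', encoding='utf-16') as f:
    f.write(final_output_content)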