Update working_yolo_pipeline.py

working_yolo_pipeline.py  CHANGED  (+8 -8)

@@ -1223,7 +1223,7 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
         return []
 
     try:
-        with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
+        with open(preprocessed_json_path, 'r', encoding='utf-16') as f:
             preprocessed_data = json.load(f)
             print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
     except Exception:
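json.load only sees whatever text open() hands it, so the encoding named here has to match the encoding the preprocessed JSON was actually written with. A minimal round-trip sketch; the file name and payload are purely illustrative, and the same pattern holds whether the codec is utf-8 or utf-16:

import json

payload = {"pages": [{"page": 1, "words": ["Résumé", "données"]}]}

# Writer and reader must name the same codec explicitly.
with open("preprocessed_sample.json", "w", encoding="utf-16") as f:
    json.dump(payload, f, indent=2, ensure_ascii=False)

with open("preprocessed_sample.json", "r", encoding="utf-16") as f:
    preprocessed_data = json.load(f)

print(f"Loaded preprocessed data with {len(preprocessed_data['pages'])} pages.")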

@@ -1288,7 +1288,7 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
 
         # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
         chunk_words = [
-            str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
+            str(t['word']).encode('utf-16', errors='ignore').decode('utf-16')
             for t in chunk_tokens
         ]
         chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
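The encode/decode pass with errors='ignore' is the piece doing the clean-up: characters the codec cannot represent, such as unpaired surrogates left over from PDF text extraction, are silently dropped instead of raising. A standalone sketch of the same technique; the token list is made up for illustration, and utf-8 is interchangeable with utf-16 here:

# 'raw_tokens' is illustrative; in the pipeline the values come from chunk_tokens.
raw_tokens = [{'word': 'Invoice'}, {'word': 'total\ud800'}, {'word': 12345}]

cleaned = [
    # errors='ignore' drops the lone surrogate during encoding,
    # so the decode back to str always succeeds.
    str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
    for t in raw_tokens
]

print(cleaned)  # ['Invoice', 'total', '12345']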

@@ -1520,7 +1520,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
     print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
     print("=" * 80)
     try:
-        with open(input_path, 'r', encoding='utf-8') as f:
+        with open(input_path, 'r', encoding='utf-16') as f:
             predictions_by_page = json.load(f)
     except Exception as e:
         print(f"❌ Error loading raw prediction file: {e}")

@@ -1666,7 +1666,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
             item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
 
     try:
-        with open(output_path, 'w', encoding='utf-8') as f:
+        with open(output_path, 'w', encoding='utf-16') as f:
             json.dump(structured_data, f, indent=2, ensure_ascii=False)
     except Exception:
         pass

@@ -1784,7 +1784,7 @@ def process_context_linking(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 
             # 2. Safely clean the string by encoding to UTF-8 and ignoring errors,
             # then decoding back. This removes the invalid surrogate character.
-            q_preview = q_preview_raw.encode('utf-8', errors='ignore').decode('utf-8')
+            q_preview = q_preview_raw.encode('utf-16', errors='ignore').decode('utf-16')
             # ------------------------------------------------------------------
 
             # RESOLUTION LOGIC

@@ -1894,7 +1894,7 @@ def get_base64_for_file(filepath: str) -> Optional[str]:
     try:
         with open(filepath, "rb") as image_file:
             # Return raw base64 string
-            return base64.b64encode(image_file.read()).decode('utf-8')
+            return base64.b64encode(image_file.read()).decode('utf-16')
     except Exception as e:
         print(f"Error reading and encoding file {filepath}: {e}")
         return None
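get_base64_for_file returns the base64 text on success and None on any read or encode failure, so callers only need a None check. A usage sketch; the import path and image file name are assumptions for illustration:

from working_yolo_pipeline import get_base64_for_file  # assumed module name

b64_string = get_base64_for_file('page_1.png')  # illustrative path
if b64_string is not None:
    # e.g. embed the image inline in an HTML report or API payload
    data_uri = f'data:image/png;base64,{b64_string}'
    print(data_uri[:60])
else:
    print('Could not encode image.')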

@@ -2028,7 +2028,7 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
 
         # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
         # Save raw predictions to the temporary file
-        with open(raw_output_path, 'w', encoding='utf-8') as f:
+        with open(raw_output_path, 'w', encoding='utf-16') as f:
             json.dump(page_raw_predictions_list, f, indent=4)
 
         # Explicitly copy/save the raw predictions to the user-specified debug path

@@ -2137,7 +2137,7 @@ if __name__ == "__main__":
     # final_output_content = json_str.replace('\\\\\\\\', '\\\\')
 
     # 3. Write the corrected string content to the file.
-    with open(final_output_path, 'w', encoding='utf-8') as f:
+    with open(final_output_path, 'w', encoding='utf-16') as f:
         f.write(final_output_content)
 
     print(f"\n✅ Final Data Saved: {final_output_path}")
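The final write serializes to a string first so the content can be post-processed (for example the commented-out backslash clean-up above) before it touches the file. A sketch of that three-step flow; structured_data, the clean-up pattern, and the output name are illustrative stand-ins for what the pipeline builds upstream:

import json

structured_data = [{'new_passage': 'Example passage.'}]  # placeholder for the real pipeline output

# 1. Serialize to a string rather than dumping straight to the file.
json_str = json.dumps(structured_data, indent=2, ensure_ascii=False)

# 2. Optional clean-up of over-escaped backslashes; the exact pattern depends on the upstream data.
final_output_content = json_str.replace('\\\\', '\\')

# 3. Write the corrected string content to the file, naming the encoding explicitly.
with open('final_output.json', 'w', encoding='utf-16') as f:
    f.write(final_output_content)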