Spaces:

heerjtdev
/

layout_latex

Running

App Files Community

heerjtdev committed on Jan 16

Commit

6af814c

verified ·

1 Parent(s): 1d00dbc

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -46

app.py CHANGED Viewed

@@ -139,13 +139,13 @@
 import gradio as gr
 import json
 import os
 import tempfile
 import img2pdf
 import glob
 from img2pdf import Rotation
 from pathlib import Path
@@ -164,15 +164,11 @@ except ImportError:
 def process_file(uploaded_files, layoutlmv3_model_path=None):
     """
     Robust handler for multiple or single file uploads.
-    Returns the final JSON and the file path for download.
-    If the pipeline fails at BIO conversion, it attempts to return the raw predictions for debugging.
     """
     if uploaded_files is None:
         return "❌ Error: No files uploaded.", None
-    # --- THE ROBUST FIX ---
-    # Gradio sometimes sends a single dict even when set to multiple.
-    # We force everything into a list so the rest of the logic doesn't break.
     if not isinstance(uploaded_files, list):
         file_list = [uploaded_files]
     else:
@@ -180,7 +176,6 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
     if len(file_list) == 0:
         return "❌ Error: Empty file list.", None
-    # ----------------------
     # 1. Resolve all file paths safely
     resolved_paths = []
@@ -203,7 +198,6 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
     is_image = first_file.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.webp', '.tiff']
     try:
-        # If it's multiple files or just one image, wrap it in a PDF
         if len(resolved_paths) > 1 or is_image:
             print(f"📦 Converting {len(resolved_paths)} image(s) to a single PDF...")
             temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
@@ -211,7 +205,6 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
                 f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
             processing_path = temp_pdf.name
         else:
-            # It's a single PDF
             processing_path = resolved_paths[0]
         # 3. Standard Pipeline Checks
@@ -223,38 +216,35 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
         print(f"🚀 Starting pipeline for: {processing_path}")
         result = run_document_pipeline(processing_path, final_model_path)
-        # --- DEBUGGING LOGIC FOR STEP 3 FAILURE ---
         if result is None or (isinstance(result, list) and len(result) == 0):
-            print("⚠️ Pipeline returned no structured data. Looking for raw predictions for debugging...")
-            # Based on your logs, the pipeline creates a folder like /tmp/pipeline_run_[filename]
-            base_name = Path(processing_path).stem
-            search_pattern = f"/tmp/pipeline_run_{base_name}*/*_raw_predictions.json"
-            possible_files = glob.glob(search_pattern)
-            if possible_files:
-                debug_file = possible_files[0]
-                print(f"🔍 DEBUG: Found raw predictions at {debug_file}")
-                with open(debug_file, 'r', encoding='utf-8') as f:
-                    raw_data = json.load(f)
-                # Return the raw labels to the UI so you can see why it failed
-                return (
-                    "⚠️ WARNING: BIO Decoding Failed (Step 3).\n"
-                    "Showing RAW LayoutLMv3 predictions instead for analysis:\n\n" +
-                    json.dumps(raw_data, indent=2, ensure_ascii=False),
-                    debug_file
-                )
-            return "❌ Error: Pipeline failed and no intermediate raw prediction file was found.", None
-        # ------------------------------------------
-        # 5. Prepare output (Successful Path)
-        temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
-        with open(temp_output.name, 'w', encoding='utf-8') as f:
-            json.dump(result, f, indent=2, ensure_ascii=False)
-        return json.dumps(result, indent=2, ensure_ascii=False), temp_output.name
     except Exception as e:
         import traceback
@@ -266,8 +256,9 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
 # ==============================
 with gr.Blocks(title="Document Analysis Pipeline") as demo:
-    gr.Markdown("# 📄 Document & Image Analysis Pipeline")
-    gr.Markdown("### 🛠 Debug Mode Active: If Step 3 fails, the Raw Prediction file will be returned.")
     with gr.Row():
         with gr.Column(scale=1):
@@ -283,11 +274,12 @@ with gr.Blocks(title="Document Analysis Pipeline") as demo:
                 value=DEFAULT_LAYOUTLMV3_MODEL_PATH
             )
-            process_btn = gr.Button("🚀 Process Files", variant="primary")
         with gr.Column(scale=2):
-            json_output = gr.Code(label="JSON Output (Structured or Raw Predictions)", language="json", lines=20)
-            download_output = gr.File(label="Download JSON File")
     process_btn.click(
         fn=process_file,
@@ -296,8 +288,5 @@ with gr.Blocks(title="Document Analysis Pipeline") as demo:
     )
 if __name__ == "__main__":
-    # Note: 0.0.0.0 allows access from outside the container/host
     demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)

 import gradio as gr
 import json
 import os
 import tempfile
 import img2pdf
 import glob
+import shutil
 from img2pdf import Rotation
 from pathlib import Path
 def process_file(uploaded_files, layoutlmv3_model_path=None):
     """
     Robust handler for multiple or single file uploads.
+    Returns the final JSON and a LIST of all intermediate JSON files (OCR, Predictions, BIO).
     """
     if uploaded_files is None:
         return "❌ Error: No files uploaded.", None
     if not isinstance(uploaded_files, list):
         file_list = [uploaded_files]
     else:
     if len(file_list) == 0:
         return "❌ Error: Empty file list.", None
     # 1. Resolve all file paths safely
     resolved_paths = []
     is_image = first_file.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.webp', '.tiff']
     try:
         if len(resolved_paths) > 1 or is_image:
             print(f"📦 Converting {len(resolved_paths)} image(s) to a single PDF...")
             temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
                 f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
             processing_path = temp_pdf.name
         else:
             processing_path = resolved_paths[0]
         # 3. Standard Pipeline Checks
         print(f"🚀 Starting pipeline for: {processing_path}")
         result = run_document_pipeline(processing_path, final_model_path)
+        # 5. SCRAPE FOR INTERMEDIATE FILES
+        # We look for all .json files in /tmp/ created during this run
+        base_name = Path(processing_path).stem
+        # This matches common patterns like /tmp/pipeline_run_... or filenames in /tmp/
+        search_patterns = [
+            f"/tmp/pipeline_run_{base_name}*/*.json",
+            f"/tmp/*{base_name}*.json"
+        ]
+        all_intermediate_jsons = []
+        for pattern in search_patterns:
+            all_intermediate_jsons.extend(glob.glob(pattern))
+        # Remove duplicates while preserving order
+        all_intermediate_jsons = list(dict.fromkeys(all_intermediate_jsons))
+        # 6. Prepare Final Output for Display
         if result is None or (isinstance(result, list) and len(result) == 0):
+            display_text = "⚠️ Pipeline failed at Step 3 (BIO Decoding).\nDownload the intermediate JSONs below to inspect OCR and Model Predictions."
+        else:
+            display_text = json.dumps(result, indent=2, ensure_ascii=False)
+            # If the final result succeeded, save it to a temp file so it can be downloaded too
+            temp_final = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='final_result_')
+            json.dump(result, temp_final, indent=2, ensure_ascii=False)
+            temp_final.close()
+            all_intermediate_jsons.append(temp_final.name)
+        return display_text, all_intermediate_jsons
     except Exception as e:
         import traceback
 # ==============================
 with gr.Blocks(title="Document Analysis Pipeline") as demo:
+    gr.Markdown("# 📄 Full Pipeline Analysis")
+    gr.Markdown("### 🔍 Intermediate File Recovery Active")
+    gr.Markdown("The **Download** box will contain: \n1. OCR JSON (Step 1)\n2. Raw LayoutLMv3 Prediction JSON (Step 2)\n3. Final BIO JSON (Step 3)")
     with gr.Row():
         with gr.Column(scale=1):
                 value=DEFAULT_LAYOUTLMV3_MODEL_PATH
             )
+            process_btn = gr.Button("🚀 Run Pipeline", variant="primary")
         with gr.Column(scale=2):
+            json_output = gr.Code(label="Final Structured Output", language="json", lines=20)
+            # IMPORTANT: file_count="multiple" allows returning the list of all stage files
+            download_output = gr.File(label="Download All Pipeline Stages (JSON)", file_count="multiple")
     process_btn.click(
         fn=process_file,
     )
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)