heerjtdev committed on
Commit
6af814c
·
verified ·
1 Parent(s): 1d00dbc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -46
app.py CHANGED
@@ -139,13 +139,13 @@
139
 
140
 
141
 
142
-
143
  import gradio as gr
144
  import json
145
  import os
146
  import tempfile
147
  import img2pdf
148
  import glob
 
149
  from img2pdf import Rotation
150
  from pathlib import Path
151
 
@@ -164,15 +164,11 @@ except ImportError:
164
  def process_file(uploaded_files, layoutlmv3_model_path=None):
165
  """
166
  Robust handler for multiple or single file uploads.
167
- Returns the final JSON and the file path for download.
168
- If the pipeline fails at BIO conversion, it attempts to return the raw predictions for debugging.
169
  """
170
  if uploaded_files is None:
171
  return "❌ Error: No files uploaded.", None
172
 
173
- # --- THE ROBUST FIX ---
174
- # Gradio sometimes sends a single dict even when set to multiple.
175
- # We force everything into a list so the rest of the logic doesn't break.
176
  if not isinstance(uploaded_files, list):
177
  file_list = [uploaded_files]
178
  else:
@@ -180,7 +176,6 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
180
 
181
  if len(file_list) == 0:
182
  return "❌ Error: Empty file list.", None
183
- # ----------------------
184
 
185
  # 1. Resolve all file paths safely
186
  resolved_paths = []
@@ -203,7 +198,6 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
203
  is_image = first_file.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.webp', '.tiff']
204
 
205
  try:
206
- # If it's multiple files or just one image, wrap it in a PDF
207
  if len(resolved_paths) > 1 or is_image:
208
  print(f"πŸ“¦ Converting {len(resolved_paths)} image(s) to a single PDF...")
209
  temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
@@ -211,7 +205,6 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
211
  f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
212
  processing_path = temp_pdf.name
213
  else:
214
- # It's a single PDF
215
  processing_path = resolved_paths[0]
216
 
217
  # 3. Standard Pipeline Checks
@@ -223,38 +216,35 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
223
  print(f"πŸš€ Starting pipeline for: {processing_path}")
224
  result = run_document_pipeline(processing_path, final_model_path)
225
 
226
- # --- DEBUGGING LOGIC FOR STEP 3 FAILURE ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  if result is None or (isinstance(result, list) and len(result) == 0):
228
- print("⚠️ Pipeline returned no structured data. Looking for raw predictions for debugging...")
229
-
230
- # Based on your logs, the pipeline creates a folder like /tmp/pipeline_run_[filename]
231
- base_name = Path(processing_path).stem
232
- search_pattern = f"/tmp/pipeline_run_{base_name}*/*_raw_predictions.json"
233
- possible_files = glob.glob(search_pattern)
234
-
235
- if possible_files:
236
- debug_file = possible_files[0]
237
- print(f"πŸ” DEBUG: Found raw predictions at {debug_file}")
238
- with open(debug_file, 'r', encoding='utf-8') as f:
239
- raw_data = json.load(f)
240
-
241
- # Return the raw labels to the UI so you can see why it failed
242
- return (
243
- "⚠️ WARNING: BIO Decoding Failed (Step 3).\n"
244
- "Showing RAW LayoutLMv3 predictions instead for analysis:\n\n" +
245
- json.dumps(raw_data, indent=2, ensure_ascii=False),
246
- debug_file
247
- )
248
 
249
- return "❌ Error: Pipeline failed and no intermediate raw prediction file was found.", None
250
- # ------------------------------------------
 
 
 
251
 
252
- # 5. Prepare output (Successful Path)
253
- temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
254
- with open(temp_output.name, 'w', encoding='utf-8') as f:
255
- json.dump(result, f, indent=2, ensure_ascii=False)
256
-
257
- return json.dumps(result, indent=2, ensure_ascii=False), temp_output.name
258
 
259
  except Exception as e:
260
  import traceback
@@ -266,8 +256,9 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
266
  # ==============================
267
  with gr.Blocks(title="Document Analysis Pipeline") as demo:
268
 
269
- gr.Markdown("# πŸ“„ Document & Image Analysis Pipeline")
270
- gr.Markdown("### πŸ›  Debug Mode Active: If Step 3 fails, the Raw Prediction file will be returned.")
 
271
 
272
  with gr.Row():
273
  with gr.Column(scale=1):
@@ -283,11 +274,12 @@ with gr.Blocks(title="Document Analysis Pipeline") as demo:
283
  value=DEFAULT_LAYOUTLMV3_MODEL_PATH
284
  )
285
 
286
- process_btn = gr.Button("πŸš€ Process Files", variant="primary")
287
 
288
  with gr.Column(scale=2):
289
- json_output = gr.Code(label="JSON Output (Structured or Raw Predictions)", language="json", lines=20)
290
- download_output = gr.File(label="Download JSON File")
 
291
 
292
  process_btn.click(
293
  fn=process_file,
@@ -296,8 +288,5 @@ with gr.Blocks(title="Document Analysis Pipeline") as demo:
296
  )
297
 
298
  if __name__ == "__main__":
299
- # Note: 0.0.0.0 allows access from outside the container/host
300
  demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
301
 
302
-
303
-
 
139
 
140
 
141
 
 
142
  import gradio as gr
143
  import json
144
  import os
145
  import tempfile
146
  import img2pdf
147
  import glob
148
+ import shutil
149
  from img2pdf import Rotation
150
  from pathlib import Path
151
 
 
164
  def process_file(uploaded_files, layoutlmv3_model_path=None):
165
  """
166
  Robust handler for multiple or single file uploads.
167
+ Returns the final JSON and a LIST of all intermediate JSON files (OCR, Predictions, BIO).
 
168
  """
169
  if uploaded_files is None:
170
  return "❌ Error: No files uploaded.", None
171
 
 
 
 
172
  if not isinstance(uploaded_files, list):
173
  file_list = [uploaded_files]
174
  else:
 
176
 
177
  if len(file_list) == 0:
178
  return "❌ Error: Empty file list.", None
 
179
 
180
  # 1. Resolve all file paths safely
181
  resolved_paths = []
 
198
  is_image = first_file.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.webp', '.tiff']
199
 
200
  try:
 
201
  if len(resolved_paths) > 1 or is_image:
202
  print(f"πŸ“¦ Converting {len(resolved_paths)} image(s) to a single PDF...")
203
  temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
 
205
  f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
206
  processing_path = temp_pdf.name
207
  else:
 
208
  processing_path = resolved_paths[0]
209
 
210
  # 3. Standard Pipeline Checks
 
216
  print(f"πŸš€ Starting pipeline for: {processing_path}")
217
  result = run_document_pipeline(processing_path, final_model_path)
218
 
219
+ # 5. SCRAPE FOR INTERMEDIATE FILES
220
+ # We look for all .json files in /tmp/ created during this run
221
+ base_name = Path(processing_path).stem
222
+ # This matches common patterns like /tmp/pipeline_run_... or filenames in /tmp/
223
+ search_patterns = [
224
+ f"/tmp/pipeline_run_{base_name}*/*.json",
225
+ f"/tmp/*{base_name}*.json"
226
+ ]
227
+
228
+ all_intermediate_jsons = []
229
+ for pattern in search_patterns:
230
+ all_intermediate_jsons.extend(glob.glob(pattern))
231
+
232
+ # Remove duplicates while preserving order
233
+ all_intermediate_jsons = list(dict.fromkeys(all_intermediate_jsons))
234
+
235
+ # 6. Prepare Final Output for Display
236
  if result is None or (isinstance(result, list) and len(result) == 0):
237
+ display_text = "⚠️ Pipeline failed at Step 3 (BIO Decoding).\nDownload the intermediate JSONs below to inspect OCR and Model Predictions."
238
+ else:
239
+ display_text = json.dumps(result, indent=2, ensure_ascii=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
+ # If the final result succeeded, save it to a temp file so it can be downloaded too
242
+ temp_final = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='final_result_')
243
+ json.dump(result, temp_final, indent=2, ensure_ascii=False)
244
+ temp_final.close()
245
+ all_intermediate_jsons.append(temp_final.name)
246
 
247
+ return display_text, all_intermediate_jsons
 
 
 
 
 
248
 
249
  except Exception as e:
250
  import traceback
 
256
  # ==============================
257
  with gr.Blocks(title="Document Analysis Pipeline") as demo:
258
 
259
+ gr.Markdown("# πŸ“„ Full Pipeline Analysis")
260
+ gr.Markdown("### πŸ” Intermediate File Recovery Active")
261
+ gr.Markdown("The **Download** box will contain: \n1. OCR JSON (Step 1)\n2. Raw LayoutLMv3 Prediction JSON (Step 2)\n3. Final BIO JSON (Step 3)")
262
 
263
  with gr.Row():
264
  with gr.Column(scale=1):
 
274
  value=DEFAULT_LAYOUTLMV3_MODEL_PATH
275
  )
276
 
277
+ process_btn = gr.Button("πŸš€ Run Pipeline", variant="primary")
278
 
279
  with gr.Column(scale=2):
280
+ json_output = gr.Code(label="Final Structured Output", language="json", lines=20)
281
+ # IMPORTANT: file_count="multiple" allows returning the list of all stage files
282
+ download_output = gr.File(label="Download All Pipeline Stages (JSON)", file_count="multiple")
283
 
284
  process_btn.click(
285
  fn=process_file,
 
288
  )
289
 
290
  if __name__ == "__main__":
 
291
  demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
292