heerjtdev committed on
Commit
1d00dbc
·
verified ·
1 Parent(s): fd7d74f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -11
app.py CHANGED
@@ -2,11 +2,150 @@
2
 
3
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import gradio as gr
6
  import json
7
  import os
8
  import tempfile
9
  import img2pdf
 
10
  from img2pdf import Rotation
11
  from pathlib import Path
12
 
@@ -25,6 +164,8 @@ except ImportError:
25
  def process_file(uploaded_files, layoutlmv3_model_path=None):
26
  """
27
  Robust handler for multiple or single file uploads.
 
 
28
  """
29
  if uploaded_files is None:
30
  return "❌ Error: No files uploaded.", None
@@ -67,9 +208,7 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
67
  print(f"📦 Converting {len(resolved_paths)} image(s) to a single PDF...")
68
  temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
69
  with open(temp_pdf.name, "wb") as f_out:
70
- # f_out.write(img2pdf.convert(resolved_paths))
71
  f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
72
-
73
  processing_path = temp_pdf.name
74
  else:
75
  # It's a single PDF
@@ -84,10 +223,33 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
84
  print(f"🚀 Starting pipeline for: {processing_path}")
85
  result = run_document_pipeline(processing_path, final_model_path)
86
 
87
- if result is None:
88
- return "❌ Error: Pipeline returned None.", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # 5. Prepare output
91
  temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
92
  with open(temp_output.name, 'w', encoding='utf-8') as f:
93
  json.dump(result, f, indent=2, ensure_ascii=False)
@@ -105,14 +267,15 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
105
  with gr.Blocks(title="Document Analysis Pipeline") as demo:
106
 
107
  gr.Markdown("# 📄 Document & Image Analysis Pipeline")
 
108
 
109
  with gr.Row():
110
  with gr.Column(scale=1):
111
  file_input = gr.File(
112
  label="Upload PDFs or Images",
113
  file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tiff"],
114
- file_count="multiple", # Keep this
115
- type="filepath" # Keep this
116
  )
117
 
118
  model_path_input = gr.Textbox(
@@ -123,8 +286,8 @@ with gr.Blocks(title="Document Analysis Pipeline") as demo:
123
  process_btn = gr.Button("🚀 Process Files", variant="primary")
124
 
125
  with gr.Column(scale=2):
126
- json_output = gr.Code(label="JSON Output", language="json", lines=20)
127
- download_output = gr.File(label="Download JSON")
128
 
129
  process_btn.click(
130
  fn=process_file,
@@ -133,9 +296,8 @@ with gr.Blocks(title="Document Analysis Pipeline") as demo:
133
  )
134
 
135
  if __name__ == "__main__":
 
136
  demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
137
 
138
 
139
 
140
-
141
-
 
2
 
3
 
4
 
5
+ # import gradio as gr
6
+ # import json
7
+ # import os
8
+ # import tempfile
9
+ # import img2pdf
10
+ # from img2pdf import Rotation
11
+ # from pathlib import Path
12
+
13
+ # # ==============================
14
+ # # PIPELINE IMPORT
15
+ # # ==============================
16
+ # try:
17
+ # from working_yolo_pipeline import run_document_pipeline, DEFAULT_LAYOUTLMV3_MODEL_PATH, WEIGHTS_PATH
18
+ # except ImportError:
19
+ # print("Warning: 'working_yolo_pipeline.py' not found. Using dummy paths.")
20
+ # def run_document_pipeline(*args):
21
+ # return {"error": "Placeholder pipeline function called."}
22
+ # DEFAULT_LAYOUTLMV3_MODEL_PATH = "./models/layoutlmv3_model"
23
+ # WEIGHTS_PATH = "./weights/yolo_weights.pt"
24
+
25
+ # def process_file(uploaded_files, layoutlmv3_model_path=None):
26
+ # """
27
+ # Robust handler for multiple or single file uploads.
28
+ # """
29
+ # if uploaded_files is None:
30
+ # return "❌ Error: No files uploaded.", None
31
+
32
+ # # --- THE ROBUST FIX ---
33
+ # # Gradio sometimes sends a single dict even when set to multiple.
34
+ # # We force everything into a list so the rest of the logic doesn't break.
35
+ # if not isinstance(uploaded_files, list):
36
+ # file_list = [uploaded_files]
37
+ # else:
38
+ # file_list = uploaded_files
39
+
40
+ # if len(file_list) == 0:
41
+ # return "❌ Error: Empty file list.", None
42
+ # # ----------------------
43
+
44
+ # # 1. Resolve all file paths safely
45
+ # resolved_paths = []
46
+ # for f in file_list:
47
+ # try:
48
+ # if isinstance(f, dict) and "path" in f:
49
+ # resolved_paths.append(f["path"])
50
+ # elif hasattr(f, 'path'):
51
+ # resolved_paths.append(f.path)
52
+ # else:
53
+ # resolved_paths.append(str(f))
54
+ # except Exception as e:
55
+ # print(f"Error resolving path for {f}: {e}")
56
+
57
+ # if not resolved_paths:
58
+ # return "❌ Error: Could not resolve file paths.", None
59
+
60
+ # # 2. Determine if we should merge into a single PDF
61
+ # first_file = Path(resolved_paths[0])
62
+ # is_image = first_file.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.webp', '.tiff']
63
+
64
+ # try:
65
+ # # If it's multiple files or just one image, wrap it in a PDF
66
+ # if len(resolved_paths) > 1 or is_image:
67
+ # print(f"📦 Converting {len(resolved_paths)} image(s) to a single PDF...")
68
+ # temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
69
+ # with open(temp_pdf.name, "wb") as f_out:
70
+ # # f_out.write(img2pdf.convert(resolved_paths))
71
+ # f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
72
+
73
+ # processing_path = temp_pdf.name
74
+ # else:
75
+ # # It's a single PDF
76
+ # processing_path = resolved_paths[0]
77
+
78
+ # # 3. Standard Pipeline Checks
79
+ # final_model_path = layoutlmv3_model_path or DEFAULT_LAYOUTLMV3_MODEL_PATH
80
+ # if not os.path.exists(final_model_path):
81
+ # return f"❌ Error: Model not found at {final_model_path}", None
82
+
83
+ # # 4. Call the pipeline
84
+ # print(f"🚀 Starting pipeline for: {processing_path}")
85
+ # result = run_document_pipeline(processing_path, final_model_path)
86
+
87
+ # if result is None:
88
+ # return "❌ Error: Pipeline returned None.", None
89
+
90
+ # # 5. Prepare output
91
+ # temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
92
+ # with open(temp_output.name, 'w', encoding='utf-8') as f:
93
+ # json.dump(result, f, indent=2, ensure_ascii=False)
94
+
95
+ # return json.dumps(result, indent=2, ensure_ascii=False), temp_output.name
96
+
97
+ # except Exception as e:
98
+ # import traceback
99
+ # traceback.print_exc()
100
+ # return f"❌ Error: {str(e)}", None
101
+
102
+ # # ==============================
103
+ # # GRADIO INTERFACE
104
+ # # ==============================
105
+ # with gr.Blocks(title="Document Analysis Pipeline") as demo:
106
+
107
+ # gr.Markdown("# 📄 Document & Image Analysis Pipeline")
108
+
109
+ # with gr.Row():
110
+ # with gr.Column(scale=1):
111
+ # file_input = gr.File(
112
+ # label="Upload PDFs or Images",
113
+ # file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tiff"],
114
+ # file_count="multiple", # Keep this
115
+ # type="filepath" # Keep this
116
+ # )
117
+
118
+ # model_path_input = gr.Textbox(
119
+ # label="Model Path",
120
+ # value=DEFAULT_LAYOUTLMV3_MODEL_PATH
121
+ # )
122
+
123
+ # process_btn = gr.Button("🚀 Process Files", variant="primary")
124
+
125
+ # with gr.Column(scale=2):
126
+ # json_output = gr.Code(label="JSON Output", language="json", lines=20)
127
+ # download_output = gr.File(label="Download JSON")
128
+
129
+ # process_btn.click(
130
+ # fn=process_file,
131
+ # inputs=[file_input, model_path_input],
132
+ # outputs=[json_output, download_output]
133
+ # )
134
+
135
+ # if __name__ == "__main__":
136
+ # demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
137
+
138
+
139
+
140
+
141
+
142
+
143
  import gradio as gr
144
  import json
145
  import os
146
  import tempfile
147
  import img2pdf
148
+ import glob
149
  from img2pdf import Rotation
150
  from pathlib import Path
151
 
 
164
  def process_file(uploaded_files, layoutlmv3_model_path=None):
165
  """
166
  Robust handler for multiple or single file uploads.
167
+ Returns the final JSON and the file path for download.
168
+ If the pipeline fails at BIO conversion, it attempts to return the raw predictions for debugging.
169
  """
170
  if uploaded_files is None:
171
  return "❌ Error: No files uploaded.", None
 
208
  print(f"📦 Converting {len(resolved_paths)} image(s) to a single PDF...")
209
  temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
210
  with open(temp_pdf.name, "wb") as f_out:
 
211
  f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
 
212
  processing_path = temp_pdf.name
213
  else:
214
  # It's a single PDF
 
223
  print(f"🚀 Starting pipeline for: {processing_path}")
224
  result = run_document_pipeline(processing_path, final_model_path)
225
 
226
+ # --- DEBUGGING LOGIC FOR STEP 3 FAILURE ---
227
+ if result is None or (isinstance(result, list) and len(result) == 0):
228
+ print("⚠️ Pipeline returned no structured data. Looking for raw predictions for debugging...")
229
+
230
+ # Based on your logs, the pipeline creates a folder like /tmp/pipeline_run_[filename]
231
+ base_name = Path(processing_path).stem
232
+ search_pattern = f"/tmp/pipeline_run_{base_name}*/*_raw_predictions.json"
233
+ possible_files = glob.glob(search_pattern)
234
+
235
+ if possible_files:
236
+ debug_file = possible_files[0]
237
+ print(f"🔍 DEBUG: Found raw predictions at {debug_file}")
238
+ with open(debug_file, 'r', encoding='utf-8') as f:
239
+ raw_data = json.load(f)
240
+
241
+ # Return the raw labels to the UI so you can see why it failed
242
+ return (
243
+ "⚠️ WARNING: BIO Decoding Failed (Step 3).\n"
244
+ "Showing RAW LayoutLMv3 predictions instead for analysis:\n\n" +
245
+ json.dumps(raw_data, indent=2, ensure_ascii=False),
246
+ debug_file
247
+ )
248
+
249
+ return "❌ Error: Pipeline failed and no intermediate raw prediction file was found.", None
250
+ # ------------------------------------------
251
 
252
+ # 5. Prepare output (Successful Path)
253
  temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
254
  with open(temp_output.name, 'w', encoding='utf-8') as f:
255
  json.dump(result, f, indent=2, ensure_ascii=False)
 
267
  with gr.Blocks(title="Document Analysis Pipeline") as demo:
268
 
269
  gr.Markdown("# 📄 Document & Image Analysis Pipeline")
270
+ gr.Markdown("### 🛠 Debug Mode Active: If Step 3 fails, the Raw Prediction file will be returned.")
271
 
272
  with gr.Row():
273
  with gr.Column(scale=1):
274
  file_input = gr.File(
275
  label="Upload PDFs or Images",
276
  file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tiff"],
277
+ file_count="multiple",
278
+ type="filepath"
279
  )
280
 
281
  model_path_input = gr.Textbox(
 
286
  process_btn = gr.Button("🚀 Process Files", variant="primary")
287
 
288
  with gr.Column(scale=2):
289
+ json_output = gr.Code(label="JSON Output (Structured or Raw Predictions)", language="json", lines=20)
290
+ download_output = gr.File(label="Download JSON File")
291
 
292
  process_btn.click(
293
  fn=process_file,
 
296
  )
297
 
298
  if __name__ == "__main__":
299
+ # Note: 0.0.0.0 allows access from outside the container/host
300
  demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
301
 
302
 
303