heerjtdev commited on
Commit
78471d0
·
verified ·
1 Parent(s): fa67ff4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -176
app.py CHANGED
@@ -2,152 +2,11 @@
2
 
3
 
4
 
5
- # import gradio as gr
6
- # import json
7
- # import os
8
- # import tempfile
9
- # import img2pdf
10
- # from img2pdf import Rotation
11
- # from pathlib import Path
12
-
13
# ==============================
# PIPELINE IMPORT
# ==============================
# Prefer the real pipeline module; fall back to inert placeholders so the
# UI can still start (and report a clear error) when the module is absent.
try:
    from working_yolo_pipeline import run_document_pipeline, DEFAULT_LAYOUTLMV3_MODEL_PATH, WEIGHTS_PATH
except ImportError:
    print("Warning: 'working_yolo_pipeline.py' not found. Using dummy paths.")

    DEFAULT_LAYOUTLMV3_MODEL_PATH = "./models/layoutlmv3_model"
    WEIGHTS_PATH = "./weights/yolo_weights.pt"

    def run_document_pipeline(*args):
        # Stand-in with the same call shape as the real pipeline entry point.
        return {"error": "Placeholder pipeline function called."}
24
-
25
def process_file(uploaded_files, layoutlmv3_model_path=None):
    """
    Run the document pipeline on one or more uploaded files.

    Robust handler for multiple or single file uploads coming from a
    Gradio ``gr.File`` component.

    Parameters
    ----------
    uploaded_files
        ``None``, a single upload, or a list of uploads.  Each upload may
        be a dict with a ``"path"`` key, an object with a ``.path``
        attribute, or a plain filesystem path (Gradio varies by version).
    layoutlmv3_model_path : str, optional
        Override for the LayoutLMv3 model directory; falls back to
        ``DEFAULT_LAYOUTLMV3_MODEL_PATH``.

    Returns
    -------
    tuple[str, str | None]
        (pretty-printed JSON result or an error message,
         path of a downloadable JSON file or ``None`` on failure).
    """
    if uploaded_files is None:
        return "❌ Error: No files uploaded.", None

    # Gradio sometimes sends a single item even when file_count="multiple";
    # normalise to a list so the rest of the logic is uniform.
    file_list = uploaded_files if isinstance(uploaded_files, list) else [uploaded_files]

    if len(file_list) == 0:
        return "❌ Error: Empty file list.", None

    # 1. Resolve every upload to a filesystem path, tolerating the three
    #    shapes Gradio may hand us (dict, object with .path, plain string).
    resolved_paths = []
    for f in file_list:
        try:
            if isinstance(f, dict) and "path" in f:
                resolved_paths.append(f["path"])
            elif hasattr(f, 'path'):
                resolved_paths.append(f.path)
            else:
                resolved_paths.append(str(f))
        except Exception as e:
            print(f"Error resolving path for {f}: {e}")

    if not resolved_paths:
        return "❌ Error: Could not resolve file paths.", None

    # 2. Decide whether the inputs must be merged into a single PDF.
    first_file = Path(resolved_paths[0])
    is_image = first_file.suffix.lower() in {'.jpg', '.jpeg', '.png', '.bmp', '.webp', '.tiff'}

    try:
        # Multiple files, or a lone image, get wrapped into one PDF.
        if len(resolved_paths) > 1 or is_image:
            print(f"📦 Converting {len(resolved_paths)} image(s) to a single PDF...")
            # mkstemp instead of NamedTemporaryFile(delete=False): the
            # handle NamedTemporaryFile returned was never closed (resource
            # leak), and re-opening a still-open temp file by name fails on
            # Windows.  fdopen closes the descriptor deterministically.
            fd, pdf_path = tempfile.mkstemp(suffix=".pdf")
            with os.fdopen(fd, "wb") as f_out:
                # rotation=Rotation.ifvalid tolerates images whose EXIF
                # orientation tag is missing or malformed.
                f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
            processing_path = pdf_path
        else:
            # It's a single PDF; process it directly.
            processing_path = resolved_paths[0]

        # 3. Validate the model location before the expensive pipeline call.
        final_model_path = layoutlmv3_model_path or DEFAULT_LAYOUTLMV3_MODEL_PATH
        if not os.path.exists(final_model_path):
            return f"❌ Error: Model not found at {final_model_path}", None

        # 4. Call the pipeline.
        print(f"🚀 Starting pipeline for: {processing_path}")
        result = run_document_pipeline(processing_path, final_model_path)

        if result is None:
            return "❌ Error: Pipeline returned None.", None

        # 5. Persist the result so the UI can offer it as a download.
        fd, json_path = tempfile.mkstemp(suffix='.json', prefix='analysis_')
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        return json.dumps(result, indent=2, ensure_ascii=False), json_path

    except Exception as e:
        # Boundary handler: surface any failure to the UI instead of crashing.
        import traceback
        traceback.print_exc()
        return f"❌ Error: {str(e)}", None
101
-
102
# ==============================
# GRADIO INTERFACE
# ==============================
with gr.Blocks(title="Document Analysis Pipeline") as demo:
    gr.Markdown("# 📄 Document & Image Analysis Pipeline")

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload PDFs or Images",
                file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tiff"],
                file_count="multiple",
                type="filepath",
            )
            model_path_input = gr.Textbox(label="Model Path", value=DEFAULT_LAYOUTLMV3_MODEL_PATH)
            process_btn = gr.Button("🚀 Process Files", variant="primary")

        # Right column: JSON viewer plus a download link for the same data.
        with gr.Column(scale=2):
            json_output = gr.Code(label="JSON Output", language="json", lines=20)
            download_output = gr.File(label="Download JSON")

    process_btn.click(
        fn=process_file,
        inputs=[file_input, model_path_input],
        outputs=[json_output, download_output],
    )

if __name__ == "__main__":
    # 0.0.0.0 so the app is reachable from outside the container/host.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
137
-
138
-
139
-
140
-
141
-
142
-
143
-
144
-
145
  import gradio as gr
146
  import json
147
  import os
148
  import tempfile
149
  import img2pdf
150
- import glob
151
  from img2pdf import Rotation
152
  from pathlib import Path
153
 
@@ -166,8 +25,6 @@ except ImportError:
166
  def process_file(uploaded_files, layoutlmv3_model_path=None):
167
  """
168
  Robust handler for multiple or single file uploads.
169
- Returns the final JSON and the file path for download.
170
- If the pipeline fails at BIO conversion, it attempts to return the raw predictions for debugging.
171
  """
172
  if uploaded_files is None:
173
  return "❌ Error: No files uploaded.", None
@@ -210,7 +67,9 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
210
  print(f"πŸ“¦ Converting {len(resolved_paths)} image(s) to a single PDF...")
211
  temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
212
  with open(temp_pdf.name, "wb") as f_out:
 
213
  f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
 
214
  processing_path = temp_pdf.name
215
  else:
216
  # It's a single PDF
@@ -225,33 +84,10 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
225
  print(f"πŸš€ Starting pipeline for: {processing_path}")
226
  result = run_document_pipeline(processing_path, final_model_path)
227
 
228
- # --- DEBUGGING LOGIC FOR STEP 3 FAILURE ---
229
- if result is None or (isinstance(result, list) and len(result) == 0):
230
- print("⚠️ Pipeline returned no structured data. Looking for raw predictions for debugging...")
231
-
232
- # Based on your logs, the pipeline creates a folder like /tmp/pipeline_run_[filename]
233
- base_name = Path(processing_path).stem
234
- search_pattern = f"/tmp/pipeline_run_{base_name}*/*_raw_predictions.json"
235
- possible_files = glob.glob(search_pattern)
236
 
237
- if possible_files:
238
- debug_file = possible_files[0]
239
- print(f"πŸ” DEBUG: Found raw predictions at {debug_file}")
240
- with open(debug_file, 'r', encoding='utf-8') as f:
241
- raw_data = json.load(f)
242
-
243
- # Return the raw labels to the UI so you can see why it failed
244
- return (
245
- "⚠️ WARNING: BIO Decoding Failed (Step 3).\n"
246
- "Showing RAW LayoutLMv3 predictions instead for analysis:\n\n" +
247
- json.dumps(raw_data, indent=2, ensure_ascii=False),
248
- debug_file
249
- )
250
-
251
- return "❌ Error: Pipeline failed and no intermediate raw prediction file was found.", None
252
- # ------------------------------------------
253
-
254
- # 5. Prepare output (Successful Path)
255
  temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
256
  with open(temp_output.name, 'w', encoding='utf-8') as f:
257
  json.dump(result, f, indent=2, ensure_ascii=False)
@@ -269,15 +105,14 @@ def process_file(uploaded_files, layoutlmv3_model_path=None):
269
  with gr.Blocks(title="Document Analysis Pipeline") as demo:
270
 
271
  gr.Markdown("# πŸ“„ Document & Image Analysis Pipeline")
272
- gr.Markdown("### πŸ›  Debug Mode Active: If Step 3 fails, the Raw Prediction file will be returned.")
273
 
274
  with gr.Row():
275
  with gr.Column(scale=1):
276
  file_input = gr.File(
277
  label="Upload PDFs or Images",
278
  file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tiff"],
279
- file_count="multiple",
280
- type="filepath"
281
  )
282
 
283
  model_path_input = gr.Textbox(
@@ -288,8 +123,8 @@ with gr.Blocks(title="Document Analysis Pipeline") as demo:
288
  process_btn = gr.Button("πŸš€ Process Files", variant="primary")
289
 
290
  with gr.Column(scale=2):
291
- json_output = gr.Code(label="JSON Output (Structured or Raw Predictions)", language="json", lines=20)
292
- download_output = gr.File(label="Download JSON File")
293
 
294
  process_btn.click(
295
  fn=process_file,
@@ -298,5 +133,9 @@ with gr.Blocks(title="Document Analysis Pipeline") as demo:
298
  )
299
 
300
  if __name__ == "__main__":
301
- # Note: 0.0.0.0 allows access from outside the container/host
302
- demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
 
 
 
 
 
2
 
3
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import gradio as gr
6
  import json
7
  import os
8
  import tempfile
9
  import img2pdf
 
10
  from img2pdf import Rotation
11
  from pathlib import Path
12
 
 
25
  def process_file(uploaded_files, layoutlmv3_model_path=None):
26
  """
27
  Robust handler for multiple or single file uploads.
 
 
28
  """
29
  if uploaded_files is None:
30
  return "❌ Error: No files uploaded.", None
 
67
  print(f"πŸ“¦ Converting {len(resolved_paths)} image(s) to a single PDF...")
68
  temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
69
  with open(temp_pdf.name, "wb") as f_out:
70
+ # f_out.write(img2pdf.convert(resolved_paths))
71
  f_out.write(img2pdf.convert(resolved_paths, rotation=Rotation.ifvalid))
72
+
73
  processing_path = temp_pdf.name
74
  else:
75
  # It's a single PDF
 
84
  print(f"πŸš€ Starting pipeline for: {processing_path}")
85
  result = run_document_pipeline(processing_path, final_model_path)
86
 
87
+ if result is None:
88
+ return "❌ Error: Pipeline returned None.", None
 
 
 
 
 
 
89
 
90
+ # 5. Prepare output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
92
  with open(temp_output.name, 'w', encoding='utf-8') as f:
93
  json.dump(result, f, indent=2, ensure_ascii=False)
 
105
  with gr.Blocks(title="Document Analysis Pipeline") as demo:
106
 
107
  gr.Markdown("# πŸ“„ Document & Image Analysis Pipeline")
 
108
 
109
  with gr.Row():
110
  with gr.Column(scale=1):
111
  file_input = gr.File(
112
  label="Upload PDFs or Images",
113
  file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tiff"],
114
+ file_count="multiple", # Keep this
115
+ type="filepath" # Keep this
116
  )
117
 
118
  model_path_input = gr.Textbox(
 
123
  process_btn = gr.Button("πŸš€ Process Files", variant="primary")
124
 
125
  with gr.Column(scale=2):
126
+ json_output = gr.Code(label="JSON Output", language="json", lines=20)
127
+ download_output = gr.File(label="Download JSON")
128
 
129
  process_btn.click(
130
  fn=process_file,
 
133
  )
134
 
135
  if __name__ == "__main__":
136
+ demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
137
+
138
+
139
+
140
+
141
+