mlbench123 committed on
Commit
e277539
Β·
verified Β·
1 Parent(s): 13f5bf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -296
app.py CHANGED
@@ -7,22 +7,19 @@ import traceback
7
 
8
  from PIL import Image
9
  import PyPDF2
10
-
11
- # Open-source OCR + PDF rendering
12
  import pytesseract
13
  from pdf2image import convert_from_path
14
-
15
- # Open-source model inference via Hugging Face
16
  from huggingface_hub import InferenceClient
17
 
18
 
19
  # ==============================================================
20
- # Extraction prompt (JSON schema)
21
  # ==============================================================
 
22
  EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
23
- You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
24
 
25
- Extract and structure the data as valid JSON only (no markdown, no commentary):
26
 
27
  {
28
  "poNumber": string | null,
@@ -59,379 +56,209 @@ Extract and structure the data as valid JSON only (no markdown, no commentary):
59
  }
60
  }
61
 
62
- EXTRACTION RULES:
63
- 1. Extract ALL product line items - create one inventory item per product line
64
- 2. Parse dimensions: "2X6X14" β†’ pcsHeight=2, pcsWidth=6, pcsLength=14 (numbers only)
65
- 3. BF = totalQuantity (if total board-feet is present)
66
- 4. Convert BF to MBF: BF Γ· 1000
67
- 5. customFields format: "Key||Value" (e.g., "Mill||Tolko")
68
- 6. Look for: PO numbers, shipping info, quantities, product codes, dimensions
69
- 7. If multiple documents, consolidate all items into one JSON
70
- 8. Return null for missing fields
71
- 9. attachments should list all provided filenames
72
-
73
- Return ONLY valid JSON matching this exact structure."""
74
 
75
 
76
  # ==============================================================
77
- # Utilities: JSON extraction/cleaning
78
  # ==============================================================
79
 
80
- def _strip_code_fences(s: str) -> str:
81
- s = (s or "").strip()
82
- if s.startswith("```"):
83
- # remove opening fence line (optionally "```json")
84
- parts = s.split("\n", 1)
85
- if len(parts) == 2:
86
- s = parts[1]
87
- else:
88
- s = s.replace("```", "", 1)
89
- if s.endswith("```"):
90
- s = s[:-3]
91
- return s.strip()
92
 
 
 
 
93
 
94
- def _extract_first_json_object(s: str) -> str:
95
- """
96
- Pull the first JSON object from a model response, even if extra text exists.
97
- """
98
- s = _strip_code_fences(s)
99
 
100
- start = s.find("{")
101
- end = s.rfind("}")
102
- if start == -1 or end == -1 or end <= start:
103
- raise json.JSONDecodeError("No JSON object found in response", s, 0)
104
- return s[start:end + 1].strip()
105
 
106
 
107
  # ==============================================================
108
- # Text extraction: PDFs, images, docs
109
  # ==============================================================
110
 
111
  def extract_text_from_pdf(pdf_path: str) -> str:
112
- """Extract embedded text from PDF (works for text-based PDFs)."""
113
  try:
114
- with open(pdf_path, "rb") as file:
115
- pdf_reader = PyPDF2.PdfReader(file)
116
  text = ""
117
- for page_num, page in enumerate(pdf_reader.pages):
118
- page_text = page.extract_text()
119
- if page_text:
120
- text += f"\n--- Page {page_num + 1} (PDF text) ---\n{page_text}"
121
- return text.strip()
122
  except Exception as e:
123
- return f"Error extracting PDF text: {str(e)}"
124
 
125
 
126
- def ocr_image(image: Image.Image) -> str:
127
- """OCR a PIL image using Tesseract."""
128
- try:
129
- if image.mode != "RGB":
130
- image = image.convert("RGB")
131
- return pytesseract.image_to_string(image)
132
- except Exception as e:
133
- return f"Error performing OCR on image: {str(e)}"
134
 
135
 
136
- def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
137
- """
138
- Extract text from PDF:
139
- 1) Try embedded text via PyPDF2
140
- 2) If empty/insufficient, render pages and OCR
141
- """
142
- embedded = extract_text_from_pdf(pdf_path)
143
- if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
144
- return embedded
145
 
146
- try:
147
- pages = convert_from_path(pdf_path, dpi=dpi)
148
- ocr_chunks = []
149
- for i, page_img in enumerate(pages):
150
- page_text = ocr_image(page_img)
151
- ocr_chunks.append(f"\n--- Page {i+1} (OCR) ---\n{page_text}")
152
- merged = "\n".join(ocr_chunks).strip()
153
- return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
154
- except Exception as e:
155
- return (
156
- f"Error rendering PDF for OCR: {str(e)}\n"
157
- f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
158
- )
159
 
 
 
 
 
160
 
161
- def extract_text_from_docx(docx_path: str) -> str:
162
- try:
163
- import docx
164
- doc = docx.Document(docx_path)
165
- text = "\n".join([p.text for p in doc.paragraphs if p.text])
166
- return text.strip()
167
- except Exception as e:
168
- return f"Error reading Word doc: {str(e)}"
169
 
170
 
171
- def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
172
- """Process files locally (no Gemini)."""
173
- processed_data = {
174
  "text_content": "",
175
- "attachments": [],
176
- "file_info": [],
177
  }
178
 
179
- if not files:
180
- return processed_data
 
181
 
182
- for file_path in files:
183
- if not os.path.exists(file_path):
184
- continue
185
 
186
- file_name = Path(file_path).name
187
- file_ext = Path(file_path).suffix.lower()
188
 
189
- processed_data["attachments"].append(file_name)
190
- processed_data["file_info"].append(f"File: {file_name} (Type: {file_ext})")
 
191
 
192
- try:
193
- if file_ext == ".pdf":
194
- text = extract_text_from_pdf_with_ocr(file_path)
195
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
196
 
197
- elif file_ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]:
198
- img = Image.open(file_path)
199
- text = ocr_image(img)
200
- processed_data["text_content"] += f"\n\n=== {file_name} (OCR) ===\n{text}"
201
 
202
- elif file_ext in [".txt", ".csv"]:
203
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
204
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n{f.read()}"
205
-
206
- elif file_ext in [".doc", ".docx"]:
207
- text = extract_text_from_docx(file_path)
208
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
209
-
210
- else:
211
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n[Unsupported file type: {file_ext}]"
212
 
213
- except Exception as e:
214
- processed_data["text_content"] += f"\n\n=== {file_name} ===\nError processing: {str(e)}"
215
 
216
- return processed_data
217
 
218
 
219
  # ==============================================================
220
- # Open-source model extraction via Hugging Face Inference API
221
- # - Tries chat endpoint
222
- # - If model isn't chat-compatible, falls back to text generation endpoint
223
  # ==============================================================
224
 
225
- def extract_with_hf_llm(
226
- processed_data: Dict[str, Any],
227
- model_id: Optional[str] = None,
228
- ) -> Dict[str, Any]:
229
- hf_token = os.getenv("HF_TOKEN", "").strip() or None
230
- model_id = model_id or (os.getenv("HF_MODEL", "").strip() or None) or "Qwen/Qwen2.5-7B-Instruct"
231
 
232
- client = InferenceClient(model=model_id, token=hf_token)
233
 
234
  prompt = (
235
  EXTRACTION_PROMPT
236
- + "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
237
- + (processed_data.get("text_content", "") or "")
238
  + "\n\nATTACHMENTS:\n"
239
- + json.dumps(processed_data.get("attachments", []))
240
- + "\n\nReturn ONLY valid JSON."
241
  )
242
 
243
  raw = ""
244
- try:
245
- # Try chat-completions first (works for chat-enabled models)
246
- resp = client.chat_completion(
247
- messages=[
248
- {"role": "system", "content": "You extract structured data and return strict JSON only."},
249
- {"role": "user", "content": prompt},
250
- ],
251
- temperature=0.1,
252
- max_tokens=3000,
253
- )
254
- raw = (resp.choices[0].message.content or "").strip()
255
 
256
- except Exception as e:
257
- # If model is not chat-compatible, fall back to text generation
258
- msg = str(e)
259
- is_not_chat = ("not a chat model" in msg.lower()) or ("model_not_supported" in msg.lower())
260
-
261
- if not is_not_chat:
262
- return {
263
- "success": False,
264
- "error": f"Extraction error: {msg}",
265
- "traceback": traceback.format_exc(),
266
  }
 
 
267
 
 
268
  try:
269
- gen = client.text_generation(
270
- prompt,
 
 
 
 
271
  temperature=0.1,
272
- max_new_tokens=3000,
273
- return_full_text=False,
274
  )
275
- raw = (gen or "").strip()
 
276
  except Exception as e2:
277
  return {
278
  "success": False,
279
- "error": f"Text-generation fallback failed: {str(e2)}",
280
- "traceback": traceback.format_exc(),
281
  }
282
 
283
- # Parse JSON robustly
284
  try:
285
- json_text = _extract_first_json_object(raw)
286
- extracted_data = json.loads(json_text)
287
  return {
288
  "success": True,
289
- "data": extracted_data,
290
- "raw_response": raw,
291
- "model": model_id,
292
  }
293
- except json.JSONDecodeError as je:
294
  return {
295
  "success": False,
296
- "error": f"JSON parsing error: {str(je)}",
297
- "raw_response": raw,
298
- "suggestion": (
299
- "Model returned non-JSON or malformed JSON. "
300
- "Try another HF_MODEL (e.g., Qwen/Qwen2.5-7B-Instruct), or reduce max_new_tokens."
301
- ),
302
  }
303
 
304
 
305
  # ==============================================================
306
- # Main Gradio function
307
  # ==============================================================
308
 
309
  def process_documents(files):
310
- if not files or len(files) == 0:
311
- return "❌ Error: Please upload at least one file", "{}", "No files provided"
312
-
313
- try:
314
- file_paths = [f.name if hasattr(f, "name") else f for f in files]
315
- status_msg = f"πŸ“„ Processing {len(file_paths)} file(s)...\n"
316
-
317
- # Local extraction (PDF text + OCR)
318
- processed_data = process_files_for_extraction(file_paths)
319
- status_msg += f"βœ“ Files loaded: {', '.join(processed_data['attachments'])}\n"
320
- status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
321
-
322
- txt = (processed_data.get("text_content") or "").strip()
323
- if len(txt) < 30:
324
- msg = (
325
- "❌ No usable text could be extracted.\n"
326
- "If PDFs are scanned, ensure OCR dependencies are installed (tesseract-ocr + poppler-utils).\n"
327
- )
328
- return msg, "{}", msg
329
 
330
- # LLM structuring
331
- status_msg += "πŸ€– Structuring to JSON with open-source model (HF Inference API)...\n"
332
- result = extract_with_hf_llm(processed_data)
333
 
334
- if result.get("success"):
335
- json_output = json.dumps(result["data"], indent=2)
336
- status_msg += f"βœ… Extraction successful! Model: {result.get('model')}\n"
337
- display_text = "=== EXTRACTED DATA ===\n\n" + json_output
338
- return status_msg, json_output, display_text
339
 
340
- # Failure case
341
- error_msg = f"❌ Extraction failed:\n{result.get('error', 'Unknown error')}\n"
342
- if "suggestion" in result:
343
- error_msg += f"\nπŸ’‘ {result['suggestion']}\n"
344
- if "traceback" in result:
345
- error_msg += f"\nDebug info:\n{result['traceback'][:1200]}\n"
346
 
347
- raw_resp = result.get("raw_response", "No response")
348
- return error_msg, "{}", f"Raw Response:\n{raw_resp[:2000]}"
 
349
 
350
- except Exception as e:
351
- error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:1200]}"
352
- return error_msg, "{}", error_msg
353
 
354
 
355
  # ==============================================================
356
- # Gradio Interface
357
  # ==============================================================
358
 
359
- def create_interface():
360
- with gr.Blocks(theme=gr.themes.Soft(), title="Shipping Document Data Extractor") as demo:
361
- gr.Markdown("""
362
- # πŸ“„ Shipping Document Data Extractor
363
-
364
- Upload PDFs, images, Word docs, or text files to extract structured shipping data.
365
-
366
- **Pipeline:** Local OCR/Text extraction β†’ Open-source LLM (HF Inference API) β†’ JSON
367
- **Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
368
- """)
369
-
370
- with gr.Row():
371
- with gr.Column(scale=2):
372
- file_input = gr.File(
373
- label="πŸ“Ž Upload Documents",
374
- file_count="multiple",
375
- file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".csv", ".doc", ".docx"]
376
- )
377
-
378
- gr.Markdown("**Try with example:**")
379
- example_btn = gr.Button("πŸ“„ Load Example PDF", size="sm", variant="secondary")
380
-
381
- submit_btn = gr.Button("πŸš€ Extract Data", variant="primary", size="lg")
382
-
383
- with gr.Column(scale=3):
384
- status_output = gr.Textbox(
385
- label="πŸ“Š Status",
386
- lines=4,
387
- max_lines=8
388
- )
389
-
390
- json_output = gr.Code(
391
- label="πŸ“‹ JSON Output (Copy this)",
392
- language="json",
393
- lines=15
394
- )
395
-
396
- display_output = gr.Textbox(
397
- label="πŸ‘οΈ Preview",
398
- lines=10,
399
- max_lines=15
400
- )
401
-
402
- gr.Markdown("""
403
- ### πŸ’‘ Notes
404
- - For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
405
- - For better throughput, set **HF_TOKEN** in Space Secrets.
406
- - Switch models by setting **HF_MODEL** (e.g., `Qwen/Qwen2.5-7B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`).
407
- """)
408
-
409
- submit_btn.click(
410
- fn=process_documents,
411
- inputs=[file_input],
412
- outputs=[status_output, json_output, display_output]
413
- )
414
 
415
- def load_example():
416
- # In Spaces, example file should be in repo root
417
- example_path = "example1.pdf"
418
- if os.path.exists(example_path):
419
- return [example_path]
420
- return []
421
-
422
- example_btn.click(
423
- fn=load_example,
424
- inputs=None,
425
- outputs=file_input
426
- )
427
-
428
- return demo
429
 
430
-
431
- if __name__ == "__main__":
432
- demo = create_interface()
433
- demo.launch(
434
- server_name="0.0.0.0",
435
- server_port=7860,
436
- share=False
437
  )
 
 
 
7
 
8
  from PIL import Image
9
  import PyPDF2
 
 
10
  import pytesseract
11
  from pdf2image import convert_from_path
 
 
12
  from huggingface_hub import InferenceClient
13
 
14
 
15
  # ==============================================================
16
+ # Extraction prompt
17
  # ==============================================================
18
+
19
  EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
20
+ You will be given OCR/text extracted from shipping documents.
21
 
22
+ Extract and return ONLY valid JSON matching this schema:
23
 
24
  {
25
  "poNumber": string | null,
 
56
  }
57
  }
58
 
59
+ Return ONLY JSON. No explanation.
60
+ """
 
 
 
 
 
 
 
 
 
 
61
 
62
 
63
  # ==============================================================
64
+ # JSON Helpers
65
  # ==============================================================
66
 
67
def extract_json(text: str) -> Dict:
    """Parse the first JSON object found in a model response.

    Strips Markdown code fences (``` / ```json) before locating the
    outermost ``{ ... }`` span, so commentary around the JSON is tolerated.

    Args:
        text: raw model output, possibly fenced or surrounded by prose.

    Returns:
        The parsed JSON object as a dict/list/etc.

    Raises:
        json.JSONDecodeError: if no JSON object can be located or parsed.
    """
    text = text.strip()

    # Drop the opening ```/```json fence line, then any remaining fences.
    if text.startswith("```"):
        text = text.split("\n", 1)[-1]
        text = text.replace("```", "").strip()

    start = text.find("{")
    end = text.rfind("}")

    # Also guard end <= start (e.g. "} ... {"), which would otherwise slice
    # an empty/garbage span and surface a confusing parse error.
    if start == -1 or end == -1 or end <= start:
        raise json.JSONDecodeError("No JSON found", text, 0)

    return json.loads(text[start:end + 1])
 
81
 
82
 
83
  # ==============================================================
84
+ # OCR + TEXT EXTRACTION
85
  # ==============================================================
86
 
87
def extract_text_from_pdf(pdf_path: str) -> str:
    """Pull the embedded text layer out of a PDF.

    Best-effort: returns the concatenated page text, or a
    ``"PDF text error: ..."`` string on any failure — never raises.
    """
    try:
        with open(pdf_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            page_texts = [page.extract_text() for page in reader.pages]
        return "".join(t + "\n" for t in page_texts if t)
    except Exception as e:
        return f"PDF text error: {e}"
99
 
100
 
101
def ocr_image(img: Image.Image) -> str:
    """OCR a PIL image with Tesseract, converting to RGB first if needed."""
    rgb = img if img.mode == "RGB" else img.convert("RGB")
    return pytesseract.image_to_string(rgb)
 
 
 
 
105
 
106
 
107
def extract_pdf_with_ocr(pdf_path: str) -> str:
    """Extract text from a PDF, preferring the embedded text layer.

    Falls back to rendering pages with pdf2image and OCR-ing them when the
    embedded layer is missing, too short, or extraction failed.

    Args:
        pdf_path: path to the PDF file.

    Returns:
        Extracted text, or an error string (best-effort, never raises).
    """
    text = extract_text_from_pdf(pdf_path)

    # Only trust the embedded layer when it is real text of a useful size.
    # extract_text_from_pdf signals failure with a "PDF text error:" string,
    # which must not be mistaken for document content.
    if text and len(text) > 50 and not text.startswith("PDF text error:"):
        return text

    try:
        pages = convert_from_path(pdf_path, dpi=250)
        ocr_text = "".join(ocr_image(p) + "\n" for p in pages)
        # If OCR produced nothing, fall back to whatever the text layer gave.
        return ocr_text if ocr_text.strip() else text
    except Exception as e:
        # Typically the poppler-utils or tesseract binaries are missing.
        return f"PDF OCR error: {e}"
 
 
 
 
 
 
 
119
 
120
 
121
def process_files(files: List[str]) -> Dict[str, Any]:
    """Extract text from a batch of uploaded files.

    Supports PDF (embedded text + OCR fallback), common image formats (OCR),
    plain text/CSV, and Word documents. Per-file failures are recorded inline
    in the output instead of aborting the whole batch.

    Args:
        files: list of local file paths.

    Returns:
        dict with:
            "text_content": all extracted text, one "=== name ===" section per file
            "attachments":  the processed file names
    """
    result: Dict[str, Any] = {
        "text_content": "",
        "attachments": []
    }

    for f in files:
        path = Path(f)
        name = path.name
        ext = path.suffix.lower()

        result["attachments"].append(name)

        try:
            if ext == ".pdf":
                text = extract_pdf_with_ocr(f)

            elif ext in (".jpg", ".jpeg", ".png", ".webp"):
                img = Image.open(f)
                text = ocr_image(img)

            elif ext in (".txt", ".csv"):
                # Context manager so the handle is closed promptly
                # (the previous open(...).read() leaked the file handle).
                with open(f, encoding="utf-8", errors="ignore") as fh:
                    text = fh.read()

            elif ext in (".doc", ".docx"):
                import docx  # optional dependency; imported lazily
                doc = docx.Document(f)
                text = "\n".join(p.text for p in doc.paragraphs)

            else:
                text = ""  # unsupported type: still listed as an attachment
        except Exception as e:
            # Record the failure in place of this file's text; keep going.
            text = f"Error processing file: {e}"

        result["text_content"] += f"\n\n=== {name} ===\n{text}"

    return result
154
 
155
 
156
  # ==============================================================
157
+ # HF MODEL CALL (Robust: conversational support)
 
 
158
  # ==============================================================
159
 
160
def extract_with_hf(processed_data: Dict[str, Any]) -> Dict[str, Any]:
    """Structure extracted document text into JSON via the HF Inference API.

    Tries the chat-completion endpoint first (the supported path for
    instruct models in current huggingface_hub), then falls back to plain
    text generation for models without a chat template.

    Args:
        processed_data: output of process_files() — must contain
            "text_content" (str) and "attachments" (list of names).

    Returns:
        On success: {"success": True, "data": <parsed JSON>, "raw": <model text>}
        On failure: {"success": False, "error": ..., ...} (plus "raw"/"traceback")
    """
    hf_token = os.getenv("HF_TOKEN")
    model = os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")

    client = InferenceClient(model=model, token=hf_token)

    prompt = (
        EXTRACTION_PROMPT
        + "\n\nDOCUMENT TEXT:\n"
        + processed_data["text_content"]
        + "\n\nATTACHMENTS:\n"
        + json.dumps(processed_data["attachments"])
    )

    raw = ""

    try:
        # Preferred path: chat completion.
        # NOTE(review): the previous client.conversational(...) call was
        # deprecated/removed from huggingface_hub and took keyword args,
        # not a positional dict, so it failed on every request.
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": "Return strict JSON only."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=3000
        )
        raw = resp.choices[0].message.content
    except Exception as e1:
        try:
            # Fallback for models that are not chat-compatible.
            raw = client.text_generation(
                prompt,
                temperature=0.1,
                max_new_tokens=3000,
                return_full_text=False,
            )
        except Exception as e2:
            return {
                "success": False,
                "error": f"Model call failed:\n{e1}\n\n{e2}",
                "traceback": traceback.format_exc()
            }

    try:
        parsed = extract_json(raw)
        return {
            "success": True,
            "data": parsed,
            "raw": raw
        }
    except Exception as je:
        return {
            "success": False,
            "error": f"JSON parse error: {je}",
            "raw": raw
        }
220
 
221
 
222
  # ==============================================================
223
+ # MAIN PROCESS
224
  # ==============================================================
225
 
226
def process_documents(files):
    """Gradio callback: uploaded files -> (status, json_string, preview).

    Never raises: every failure is reported through the status string so
    the UI always receives its three outputs.

    Args:
        files: list of Gradio file objects (with a .name temp path) or
            plain path strings; may be None/empty.

    Returns:
        (status_message, json_output, preview_text) — all strings.
    """
    if not files:
        return "❌ Upload file", "{}", ""

    try:
        # Gradio file objects expose .name (a temp path); strings pass through.
        paths = [f.name if hasattr(f, "name") else f for f in files]

        status = "πŸ“„ Extracting text...\n"
        processed = process_files(paths)

        status += "πŸ€– Calling HF model...\n"
        result = extract_with_hf(processed)

        if result["success"]:
            json_out = json.dumps(result["data"], indent=2)
            # Return the accumulated status (previously built but discarded).
            status += "βœ… Success"
            return status, json_out, json_out

        return f"❌ Extraction failed:\n{result['error']}", "{}", result.get("raw", "")
    except Exception as e:
        # Guard the whole pipeline so the Gradio callback never crashes.
        err = f"❌ Unexpected error: {e}\n{traceback.format_exc()[:1200]}"
        return err, "{}", err
 
 
243
 
244
 
245
  # ==============================================================
246
+ # UI
247
  # ==============================================================
248
 
249
# Build the Gradio app at module import time and launch it.
with gr.Blocks() as demo:
    # Title banner.
    gr.Markdown("# πŸ“„ Logistic OCR – Open Source Version")

    # Inputs: multiple files of any type, plus a single action button.
    file_input = gr.File(file_count="multiple")
    btn = gr.Button("πŸš€ Extract")
    # Outputs: human-readable status, copyable JSON, and a plain-text preview.
    status = gr.Textbox(label="Status")
    json_out = gr.Code(language="json")
    preview = gr.Textbox(label="Preview")

    # Wire the button to the extraction pipeline; process_documents returns
    # exactly three strings matching the three output components.
    btn.click(
        process_documents,
        inputs=file_input,
        outputs=[status, json_out, preview]
    )

# Bind on all interfaces, port 7860 (Hugging Face Spaces convention).
demo.launch(server_name="0.0.0.0", server_port=7860)