mlbench123 committed on
Commit
13f5bf7
·
verified ·
1 Parent(s): a8b669f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -54
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import json
3
  import os
4
  from pathlib import Path
5
- from typing import List, Dict, Any, Optional, Tuple
6
  import traceback
7
 
8
  from PIL import Image
@@ -17,7 +17,7 @@ from huggingface_hub import InferenceClient
17
 
18
 
19
  # ==============================================================
20
- # Extraction prompt (same schema you used; updated wording for OCR-first)
21
  # ==============================================================
22
  EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
23
  You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
@@ -78,24 +78,25 @@ Return ONLY valid JSON matching this exact structure."""
78
  # ==============================================================
79
 
80
  def _strip_code_fences(s: str) -> str:
81
- s = s.strip()
82
  if s.startswith("```"):
83
- # remove opening fence line
84
  parts = s.split("\n", 1)
85
  if len(parts) == 2:
86
  s = parts[1]
 
 
87
  if s.endswith("```"):
88
  s = s[:-3]
89
  return s.strip()
90
 
 
91
  def _extract_first_json_object(s: str) -> str:
92
  """
93
- Attempts to pull the first valid JSON object from a model response,
94
- even if extra text exists before/after.
95
  """
96
  s = _strip_code_fences(s)
97
 
98
- # Heuristic: find first '{' and last '}' (outermost object)
99
  start = s.find("{")
100
  end = s.rfind("}")
101
  if start == -1 or end == -1 or end <= start:
@@ -121,6 +122,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
121
  except Exception as e:
122
  return f"Error extracting PDF text: {str(e)}"
123
 
 
124
  def ocr_image(image: Image.Image) -> str:
125
  """OCR a PIL image using Tesseract."""
126
  try:
@@ -130,6 +132,7 @@ def ocr_image(image: Image.Image) -> str:
130
  except Exception as e:
131
  return f"Error performing OCR on image: {str(e)}"
132
 
 
133
  def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
134
  """
135
  Extract text from PDF:
@@ -137,11 +140,9 @@ def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
137
  2) If empty/insufficient, render pages and OCR
138
  """
139
  embedded = extract_text_from_pdf(pdf_path)
140
- # Consider embedded extraction "good" if it has meaningful length
141
  if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
142
  return embedded
143
 
144
- # OCR fallback for scanned PDFs
145
  try:
146
  pages = convert_from_path(pdf_path, dpi=dpi)
147
  ocr_chunks = []
@@ -151,12 +152,11 @@ def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
151
  merged = "\n".join(ocr_chunks).strip()
152
  return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
153
  except Exception as e:
154
- # If poppler isn't installed, this will fail; surface clear error
155
- msg = (
156
  f"Error rendering PDF for OCR: {str(e)}\n"
157
  f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
158
  )
159
- return msg
160
 
161
  def extract_text_from_docx(docx_path: str) -> str:
162
  try:
@@ -169,7 +169,7 @@ def extract_text_from_docx(docx_path: str) -> str:
169
 
170
 
171
  def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
172
- """Process files locally (no Gemini upload)."""
173
  processed_data = {
174
  "text_content": "",
175
  "attachments": [],
@@ -218,32 +218,31 @@ def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
218
 
219
  # ==============================================================
220
  # Open-source model extraction via Hugging Face Inference API
 
 
221
  # ==============================================================
222
 
223
  def extract_with_hf_llm(
224
  processed_data: Dict[str, Any],
225
  model_id: Optional[str] = None,
226
  ) -> Dict[str, Any]:
227
- """
228
- Uses Hugging Face Inference API for an open-source instruct model.
229
- - Set HF_TOKEN as a Space Secret for better limits (optional).
230
- - Optionally set HF_MODEL env var to change model without code edits.
231
- """
232
- try:
233
- hf_token = os.getenv("HF_TOKEN", "").strip() or None
234
- model_id = model_id or (os.getenv("HF_MODEL", "").strip() or None) or "Qwen/Qwen2.5-7B-Instruct"
235
-
236
- client = InferenceClient(model=model_id, token=hf_token)
237
-
238
- prompt = (
239
- EXTRACTION_PROMPT
240
- + "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
241
- + processed_data.get("text_content", "")
242
- + "\n\nATTACHMENTS:\n"
243
- + json.dumps(processed_data.get("attachments", []))
244
- + "\n\nReturn ONLY valid JSON."
245
- )
246
 
 
 
 
247
  resp = client.chat_completion(
248
  messages=[
249
  {"role": "system", "content": "You extract structured data and return strict JSON only."},
@@ -252,36 +251,55 @@ def extract_with_hf_llm(
252
  temperature=0.1,
253
  max_tokens=3000,
254
  )
 
 
 
 
 
 
255
 
256
- raw = resp.choices[0].message.content if resp and resp.choices else ""
257
- raw = (raw or "").strip()
 
 
 
 
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  json_text = _extract_first_json_object(raw)
260
  extracted_data = json.loads(json_text)
261
-
262
  return {
263
  "success": True,
264
  "data": extracted_data,
265
  "raw_response": raw,
266
  "model": model_id,
267
  }
268
-
269
- except json.JSONDecodeError as e:
270
  return {
271
  "success": False,
272
- "error": f"JSON parsing error: {str(e)}",
273
- "raw_response": raw if "raw" in locals() else "",
274
  "suggestion": (
275
  "Model returned non-JSON or malformed JSON. "
276
- "Try again or switch HF_MODEL to a different instruct model."
277
  ),
278
  }
279
- except Exception as e:
280
- return {
281
- "success": False,
282
- "error": f"Extraction error: {str(e)}",
283
- "traceback": traceback.format_exc(),
284
- }
285
 
286
 
287
  # ==============================================================
@@ -301,7 +319,6 @@ def process_documents(files):
301
  status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"
302
  status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
303
 
304
- # If we extracted basically nothing, fail early with guidance
305
  txt = (processed_data.get("text_content") or "").strip()
306
  if len(txt) < 30:
307
  msg = (
@@ -317,7 +334,6 @@ def process_documents(files):
317
  if result.get("success"):
318
  json_output = json.dumps(result["data"], indent=2)
319
  status_msg += f"✅ Extraction successful! Model: {result.get('model')}\n"
320
-
321
  display_text = "=== EXTRACTED DATA ===\n\n" + json_output
322
  return status_msg, json_output, display_text
323
 
@@ -326,22 +342,22 @@ def process_documents(files):
326
  if "suggestion" in result:
327
  error_msg += f"\n💡 {result['suggestion']}\n"
328
  if "traceback" in result:
329
- error_msg += f"\nDebug info:\n{result['traceback'][:800]}\n"
330
 
331
  raw_resp = result.get("raw_response", "No response")
332
- return error_msg, "{}", f"Raw Response:\n{raw_resp[:1500]}"
333
 
334
  except Exception as e:
335
- error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:800]}"
336
  return error_msg, "{}", error_msg
337
 
338
 
339
  # ==============================================================
340
- # Gradio Interface (kept essentially the same)
341
  # ==============================================================
342
 
343
  def create_interface():
344
- with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
345
  gr.Markdown("""
346
  # 📄 Shipping Document Data Extractor
347
 
@@ -387,7 +403,7 @@ def create_interface():
387
  ### 💡 Notes
388
  - For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
389
  - For better throughput, set **HF_TOKEN** in Space Secrets.
390
- - You can switch models by setting **HF_MODEL** (e.g., `mistralai/Mistral-7B-Instruct-v0.3`).
391
  """)
392
 
393
  submit_btn.click(
 
2
  import json
3
  import os
4
  from pathlib import Path
5
+ from typing import List, Dict, Any, Optional
6
  import traceback
7
 
8
  from PIL import Image
 
17
 
18
 
19
  # ==============================================================
20
+ # Extraction prompt (JSON schema)
21
  # ==============================================================
22
  EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
23
  You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
 
78
  # ==============================================================
79
 
80
  def _strip_code_fences(s: str) -> str:
81
+ s = (s or "").strip()
82
  if s.startswith("```"):
83
+ # remove opening fence line (optionally "```json")
84
  parts = s.split("\n", 1)
85
  if len(parts) == 2:
86
  s = parts[1]
87
+ else:
88
+ s = s.replace("```", "", 1)
89
  if s.endswith("```"):
90
  s = s[:-3]
91
  return s.strip()
92
 
93
+
94
  def _extract_first_json_object(s: str) -> str:
95
  """
96
+ Pull the first JSON object from a model response, even if extra text exists.
 
97
  """
98
  s = _strip_code_fences(s)
99
 
 
100
  start = s.find("{")
101
  end = s.rfind("}")
102
  if start == -1 or end == -1 or end <= start:
 
122
  except Exception as e:
123
  return f"Error extracting PDF text: {str(e)}"
124
 
125
+
126
  def ocr_image(image: Image.Image) -> str:
127
  """OCR a PIL image using Tesseract."""
128
  try:
 
132
  except Exception as e:
133
  return f"Error performing OCR on image: {str(e)}"
134
 
135
+
136
  def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
137
  """
138
  Extract text from PDF:
 
140
  2) If empty/insufficient, render pages and OCR
141
  """
142
  embedded = extract_text_from_pdf(pdf_path)
 
143
  if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
144
  return embedded
145
 
 
146
  try:
147
  pages = convert_from_path(pdf_path, dpi=dpi)
148
  ocr_chunks = []
 
152
  merged = "\n".join(ocr_chunks).strip()
153
  return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
154
  except Exception as e:
155
+ return (
 
156
  f"Error rendering PDF for OCR: {str(e)}\n"
157
  f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
158
  )
159
+
160
 
161
  def extract_text_from_docx(docx_path: str) -> str:
162
  try:
 
169
 
170
 
171
  def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
172
+ """Process files locally (no Gemini)."""
173
  processed_data = {
174
  "text_content": "",
175
  "attachments": [],
 
218
 
219
  # ==============================================================
220
  # Open-source model extraction via Hugging Face Inference API
221
+ # - Tries chat endpoint
222
+ # - If model isn't chat-compatible, falls back to text generation endpoint
223
  # ==============================================================
224
 
225
  def extract_with_hf_llm(
226
  processed_data: Dict[str, Any],
227
  model_id: Optional[str] = None,
228
  ) -> Dict[str, Any]:
229
+ hf_token = os.getenv("HF_TOKEN", "").strip() or None
230
+ model_id = model_id or (os.getenv("HF_MODEL", "").strip() or None) or "Qwen/Qwen2.5-7B-Instruct"
231
+
232
+ client = InferenceClient(model=model_id, token=hf_token)
233
+
234
+ prompt = (
235
+ EXTRACTION_PROMPT
236
+ + "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
237
+ + (processed_data.get("text_content", "") or "")
238
+ + "\n\nATTACHMENTS:\n"
239
+ + json.dumps(processed_data.get("attachments", []))
240
+ + "\n\nReturn ONLY valid JSON."
241
+ )
 
 
 
 
 
 
242
 
243
+ raw = ""
244
+ try:
245
+ # Try chat-completions first (works for chat-enabled models)
246
  resp = client.chat_completion(
247
  messages=[
248
  {"role": "system", "content": "You extract structured data and return strict JSON only."},
 
251
  temperature=0.1,
252
  max_tokens=3000,
253
  )
254
+ raw = (resp.choices[0].message.content or "").strip()
255
+
256
+ except Exception as e:
257
+ # If model is not chat-compatible, fall back to text generation
258
+ msg = str(e)
259
+ is_not_chat = ("not a chat model" in msg.lower()) or ("model_not_supported" in msg.lower())
260
 
261
+ if not is_not_chat:
262
+ return {
263
+ "success": False,
264
+ "error": f"Extraction error: {msg}",
265
+ "traceback": traceback.format_exc(),
266
+ }
267
 
268
+ try:
269
+ gen = client.text_generation(
270
+ prompt,
271
+ temperature=0.1,
272
+ max_new_tokens=3000,
273
+ return_full_text=False,
274
+ )
275
+ raw = (gen or "").strip()
276
+ except Exception as e2:
277
+ return {
278
+ "success": False,
279
+ "error": f"Text-generation fallback failed: {str(e2)}",
280
+ "traceback": traceback.format_exc(),
281
+ }
282
+
283
+ # Parse JSON robustly
284
+ try:
285
  json_text = _extract_first_json_object(raw)
286
  extracted_data = json.loads(json_text)
 
287
  return {
288
  "success": True,
289
  "data": extracted_data,
290
  "raw_response": raw,
291
  "model": model_id,
292
  }
293
+ except json.JSONDecodeError as je:
 
294
  return {
295
  "success": False,
296
+ "error": f"JSON parsing error: {str(je)}",
297
+ "raw_response": raw,
298
  "suggestion": (
299
  "Model returned non-JSON or malformed JSON. "
300
+ "Try another HF_MODEL (e.g., Qwen/Qwen2.5-7B-Instruct), or reduce max_new_tokens."
301
  ),
302
  }
 
 
 
 
 
 
303
 
304
 
305
  # ==============================================================
 
319
  status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"
320
  status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
321
 
 
322
  txt = (processed_data.get("text_content") or "").strip()
323
  if len(txt) < 30:
324
  msg = (
 
334
  if result.get("success"):
335
  json_output = json.dumps(result["data"], indent=2)
336
  status_msg += f"✅ Extraction successful! Model: {result.get('model')}\n"
 
337
  display_text = "=== EXTRACTED DATA ===\n\n" + json_output
338
  return status_msg, json_output, display_text
339
 
 
342
  if "suggestion" in result:
343
  error_msg += f"\n💡 {result['suggestion']}\n"
344
  if "traceback" in result:
345
+ error_msg += f"\nDebug info:\n{result['traceback'][:1200]}\n"
346
 
347
  raw_resp = result.get("raw_response", "No response")
348
+ return error_msg, "{}", f"Raw Response:\n{raw_resp[:2000]}"
349
 
350
  except Exception as e:
351
+ error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:1200]}"
352
  return error_msg, "{}", error_msg
353
 
354
 
355
  # ==============================================================
356
+ # Gradio Interface
357
  # ==============================================================
358
 
359
  def create_interface():
360
+ with gr.Blocks(theme=gr.themes.Soft(), title="Shipping Document Data Extractor") as demo:
361
  gr.Markdown("""
362
  # 📄 Shipping Document Data Extractor
363
 
 
403
  ### 💡 Notes
404
  - For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
405
  - For better throughput, set **HF_TOKEN** in Space Secrets.
406
+ - Switch models by setting **HF_MODEL** (e.g., `Qwen/Qwen2.5-7B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`).
407
  """)
408
 
409
  submit_btn.click(