Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
from pathlib import Path
|
| 5 |
-
from typing import List, Dict, Any, Optional
|
| 6 |
import traceback
|
| 7 |
|
| 8 |
from PIL import Image
|
|
@@ -17,7 +17,7 @@ from huggingface_hub import InferenceClient
|
|
| 17 |
|
| 18 |
|
| 19 |
# ==============================================================
|
| 20 |
-
# Extraction prompt (
|
| 21 |
# ==============================================================
|
| 22 |
EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
|
| 23 |
You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
|
|
@@ -78,24 +78,25 @@ Return ONLY valid JSON matching this exact structure."""
|
|
| 78 |
# ==============================================================
|
| 79 |
|
| 80 |
def _strip_code_fences(s: str) -> str:
|
| 81 |
-
s = s.strip()
|
| 82 |
if s.startswith("```"):
|
| 83 |
-
# remove opening fence line
|
| 84 |
parts = s.split("\n", 1)
|
| 85 |
if len(parts) == 2:
|
| 86 |
s = parts[1]
|
|
|
|
|
|
|
| 87 |
if s.endswith("```"):
|
| 88 |
s = s[:-3]
|
| 89 |
return s.strip()
|
| 90 |
|
|
|
|
| 91 |
def _extract_first_json_object(s: str) -> str:
|
| 92 |
"""
|
| 93 |
-
|
| 94 |
-
even if extra text exists before/after.
|
| 95 |
"""
|
| 96 |
s = _strip_code_fences(s)
|
| 97 |
|
| 98 |
-
# Heuristic: find first '{' and last '}' (outermost object)
|
| 99 |
start = s.find("{")
|
| 100 |
end = s.rfind("}")
|
| 101 |
if start == -1 or end == -1 or end <= start:
|
|
@@ -121,6 +122,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
|
|
| 121 |
except Exception as e:
|
| 122 |
return f"Error extracting PDF text: {str(e)}"
|
| 123 |
|
|
|
|
| 124 |
def ocr_image(image: Image.Image) -> str:
|
| 125 |
"""OCR a PIL image using Tesseract."""
|
| 126 |
try:
|
|
@@ -130,6 +132,7 @@ def ocr_image(image: Image.Image) -> str:
|
|
| 130 |
except Exception as e:
|
| 131 |
return f"Error performing OCR on image: {str(e)}"
|
| 132 |
|
|
|
|
| 133 |
def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
|
| 134 |
"""
|
| 135 |
Extract text from PDF:
|
|
@@ -137,11 +140,9 @@ def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
|
|
| 137 |
2) If empty/insufficient, render pages and OCR
|
| 138 |
"""
|
| 139 |
embedded = extract_text_from_pdf(pdf_path)
|
| 140 |
-
# Consider embedded extraction "good" if it has meaningful length
|
| 141 |
if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
|
| 142 |
return embedded
|
| 143 |
|
| 144 |
-
# OCR fallback for scanned PDFs
|
| 145 |
try:
|
| 146 |
pages = convert_from_path(pdf_path, dpi=dpi)
|
| 147 |
ocr_chunks = []
|
|
@@ -151,12 +152,11 @@ def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
|
|
| 151 |
merged = "\n".join(ocr_chunks).strip()
|
| 152 |
return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
|
| 153 |
except Exception as e:
|
| 154 |
-
|
| 155 |
-
msg = (
|
| 156 |
f"Error rendering PDF for OCR: {str(e)}\n"
|
| 157 |
f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
|
| 158 |
)
|
| 159 |
-
|
| 160 |
|
| 161 |
def extract_text_from_docx(docx_path: str) -> str:
|
| 162 |
try:
|
|
@@ -169,7 +169,7 @@ def extract_text_from_docx(docx_path: str) -> str:
|
|
| 169 |
|
| 170 |
|
| 171 |
def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
|
| 172 |
-
"""Process files locally (no Gemini
|
| 173 |
processed_data = {
|
| 174 |
"text_content": "",
|
| 175 |
"attachments": [],
|
|
@@ -218,32 +218,31 @@ def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
|
|
| 218 |
|
| 219 |
# ==============================================================
|
| 220 |
# Open-source model extraction via Hugging Face Inference API
|
|
|
|
|
|
|
| 221 |
# ==============================================================
|
| 222 |
|
| 223 |
def extract_with_hf_llm(
|
| 224 |
processed_data: Dict[str, Any],
|
| 225 |
model_id: Optional[str] = None,
|
| 226 |
) -> Dict[str, Any]:
|
| 227 |
-
"""
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
+ "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
|
| 241 |
-
+ processed_data.get("text_content", "")
|
| 242 |
-
+ "\n\nATTACHMENTS:\n"
|
| 243 |
-
+ json.dumps(processed_data.get("attachments", []))
|
| 244 |
-
+ "\n\nReturn ONLY valid JSON."
|
| 245 |
-
)
|
| 246 |
|
|
|
|
|
|
|
|
|
|
| 247 |
resp = client.chat_completion(
|
| 248 |
messages=[
|
| 249 |
{"role": "system", "content": "You extract structured data and return strict JSON only."},
|
|
@@ -252,36 +251,55 @@ def extract_with_hf_llm(
|
|
| 252 |
temperature=0.1,
|
| 253 |
max_tokens=3000,
|
| 254 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
-
|
| 257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
json_text = _extract_first_json_object(raw)
|
| 260 |
extracted_data = json.loads(json_text)
|
| 261 |
-
|
| 262 |
return {
|
| 263 |
"success": True,
|
| 264 |
"data": extracted_data,
|
| 265 |
"raw_response": raw,
|
| 266 |
"model": model_id,
|
| 267 |
}
|
| 268 |
-
|
| 269 |
-
except json.JSONDecodeError as e:
|
| 270 |
return {
|
| 271 |
"success": False,
|
| 272 |
-
"error": f"JSON parsing error: {str(
|
| 273 |
-
"raw_response": raw
|
| 274 |
"suggestion": (
|
| 275 |
"Model returned non-JSON or malformed JSON. "
|
| 276 |
-
"Try
|
| 277 |
),
|
| 278 |
}
|
| 279 |
-
except Exception as e:
|
| 280 |
-
return {
|
| 281 |
-
"success": False,
|
| 282 |
-
"error": f"Extraction error: {str(e)}",
|
| 283 |
-
"traceback": traceback.format_exc(),
|
| 284 |
-
}
|
| 285 |
|
| 286 |
|
| 287 |
# ==============================================================
|
|
@@ -301,7 +319,6 @@ def process_documents(files):
|
|
| 301 |
status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"
|
| 302 |
status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
|
| 303 |
|
| 304 |
-
# If we extracted basically nothing, fail early with guidance
|
| 305 |
txt = (processed_data.get("text_content") or "").strip()
|
| 306 |
if len(txt) < 30:
|
| 307 |
msg = (
|
|
@@ -317,7 +334,6 @@ def process_documents(files):
|
|
| 317 |
if result.get("success"):
|
| 318 |
json_output = json.dumps(result["data"], indent=2)
|
| 319 |
status_msg += f"✅ Extraction successful! Model: {result.get('model')}\n"
|
| 320 |
-
|
| 321 |
display_text = "=== EXTRACTED DATA ===\n\n" + json_output
|
| 322 |
return status_msg, json_output, display_text
|
| 323 |
|
|
@@ -326,22 +342,22 @@ def process_documents(files):
|
|
| 326 |
if "suggestion" in result:
|
| 327 |
error_msg += f"\n💡 {result['suggestion']}\n"
|
| 328 |
if "traceback" in result:
|
| 329 |
-
error_msg += f"\nDebug info:\n{result['traceback'][:
|
| 330 |
|
| 331 |
raw_resp = result.get("raw_response", "No response")
|
| 332 |
-
return error_msg, "{}", f"Raw Response:\n{raw_resp[:
|
| 333 |
|
| 334 |
except Exception as e:
|
| 335 |
-
error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:
|
| 336 |
return error_msg, "{}", error_msg
|
| 337 |
|
| 338 |
|
| 339 |
# ==============================================================
|
| 340 |
-
# Gradio Interface
|
| 341 |
# ==============================================================
|
| 342 |
|
| 343 |
def create_interface():
|
| 344 |
-
with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
|
| 345 |
gr.Markdown("""
|
| 346 |
# 📄 Shipping Document Data Extractor
|
| 347 |
|
|
@@ -387,7 +403,7 @@ def create_interface():
|
|
| 387 |
### 💡 Notes
|
| 388 |
- For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
|
| 389 |
- For better throughput, set **HF_TOKEN** in Space Secrets.
|
| 390 |
-
-
|
| 391 |
""")
|
| 392 |
|
| 393 |
submit_btn.click(
|
|
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
from pathlib import Path
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
import traceback
|
| 7 |
|
| 8 |
from PIL import Image
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
# ==============================================================
|
| 20 |
+
# Extraction prompt (JSON schema)
|
| 21 |
# ==============================================================
|
| 22 |
EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
|
| 23 |
You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
|
|
|
|
| 78 |
# ==============================================================
|
| 79 |
|
| 80 |
def _strip_code_fences(s: str) -> str:
|
| 81 |
+
s = (s or "").strip()
|
| 82 |
if s.startswith("```"):
|
| 83 |
+
# remove opening fence line (optionally "```json")
|
| 84 |
parts = s.split("\n", 1)
|
| 85 |
if len(parts) == 2:
|
| 86 |
s = parts[1]
|
| 87 |
+
else:
|
| 88 |
+
s = s.replace("```", "", 1)
|
| 89 |
if s.endswith("```"):
|
| 90 |
s = s[:-3]
|
| 91 |
return s.strip()
|
| 92 |
|
| 93 |
+
|
| 94 |
def _extract_first_json_object(s: str) -> str:
|
| 95 |
"""
|
| 96 |
+
Pull the first JSON object from a model response, even if extra text exists.
|
|
|
|
| 97 |
"""
|
| 98 |
s = _strip_code_fences(s)
|
| 99 |
|
|
|
|
| 100 |
start = s.find("{")
|
| 101 |
end = s.rfind("}")
|
| 102 |
if start == -1 or end == -1 or end <= start:
|
|
|
|
| 122 |
except Exception as e:
|
| 123 |
return f"Error extracting PDF text: {str(e)}"
|
| 124 |
|
| 125 |
+
|
| 126 |
def ocr_image(image: Image.Image) -> str:
|
| 127 |
"""OCR a PIL image using Tesseract."""
|
| 128 |
try:
|
|
|
|
| 132 |
except Exception as e:
|
| 133 |
return f"Error performing OCR on image: {str(e)}"
|
| 134 |
|
| 135 |
+
|
| 136 |
def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
|
| 137 |
"""
|
| 138 |
Extract text from PDF:
|
|
|
|
| 140 |
2) If empty/insufficient, render pages and OCR
|
| 141 |
"""
|
| 142 |
embedded = extract_text_from_pdf(pdf_path)
|
|
|
|
| 143 |
if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
|
| 144 |
return embedded
|
| 145 |
|
|
|
|
| 146 |
try:
|
| 147 |
pages = convert_from_path(pdf_path, dpi=dpi)
|
| 148 |
ocr_chunks = []
|
|
|
|
| 152 |
merged = "\n".join(ocr_chunks).strip()
|
| 153 |
return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
|
| 154 |
except Exception as e:
|
| 155 |
+
return (
|
|
|
|
| 156 |
f"Error rendering PDF for OCR: {str(e)}\n"
|
| 157 |
f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
|
| 158 |
)
|
| 159 |
+
|
| 160 |
|
| 161 |
def extract_text_from_docx(docx_path: str) -> str:
|
| 162 |
try:
|
|
|
|
| 169 |
|
| 170 |
|
| 171 |
def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
|
| 172 |
+
"""Process files locally (no Gemini)."""
|
| 173 |
processed_data = {
|
| 174 |
"text_content": "",
|
| 175 |
"attachments": [],
|
|
|
|
| 218 |
|
| 219 |
# ==============================================================
|
| 220 |
# Open-source model extraction via Hugging Face Inference API
|
| 221 |
+
# - Tries chat endpoint
|
| 222 |
+
# - If model isn't chat-compatible, falls back to text generation endpoint
|
| 223 |
# ==============================================================
|
| 224 |
|
| 225 |
def extract_with_hf_llm(
|
| 226 |
processed_data: Dict[str, Any],
|
| 227 |
model_id: Optional[str] = None,
|
| 228 |
) -> Dict[str, Any]:
|
| 229 |
+
hf_token = os.getenv("HF_TOKEN", "").strip() or None
|
| 230 |
+
model_id = model_id or (os.getenv("HF_MODEL", "").strip() or None) or "Qwen/Qwen2.5-7B-Instruct"
|
| 231 |
+
|
| 232 |
+
client = InferenceClient(model=model_id, token=hf_token)
|
| 233 |
+
|
| 234 |
+
prompt = (
|
| 235 |
+
EXTRACTION_PROMPT
|
| 236 |
+
+ "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
|
| 237 |
+
+ (processed_data.get("text_content", "") or "")
|
| 238 |
+
+ "\n\nATTACHMENTS:\n"
|
| 239 |
+
+ json.dumps(processed_data.get("attachments", []))
|
| 240 |
+
+ "\n\nReturn ONLY valid JSON."
|
| 241 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
+
raw = ""
|
| 244 |
+
try:
|
| 245 |
+
# Try chat-completions first (works for chat-enabled models)
|
| 246 |
resp = client.chat_completion(
|
| 247 |
messages=[
|
| 248 |
{"role": "system", "content": "You extract structured data and return strict JSON only."},
|
|
|
|
| 251 |
temperature=0.1,
|
| 252 |
max_tokens=3000,
|
| 253 |
)
|
| 254 |
+
raw = (resp.choices[0].message.content or "").strip()
|
| 255 |
+
|
| 256 |
+
except Exception as e:
|
| 257 |
+
# If model is not chat-compatible, fall back to text generation
|
| 258 |
+
msg = str(e)
|
| 259 |
+
is_not_chat = ("not a chat model" in msg.lower()) or ("model_not_supported" in msg.lower())
|
| 260 |
|
| 261 |
+
if not is_not_chat:
|
| 262 |
+
return {
|
| 263 |
+
"success": False,
|
| 264 |
+
"error": f"Extraction error: {msg}",
|
| 265 |
+
"traceback": traceback.format_exc(),
|
| 266 |
+
}
|
| 267 |
|
| 268 |
+
try:
|
| 269 |
+
gen = client.text_generation(
|
| 270 |
+
prompt,
|
| 271 |
+
temperature=0.1,
|
| 272 |
+
max_new_tokens=3000,
|
| 273 |
+
return_full_text=False,
|
| 274 |
+
)
|
| 275 |
+
raw = (gen or "").strip()
|
| 276 |
+
except Exception as e2:
|
| 277 |
+
return {
|
| 278 |
+
"success": False,
|
| 279 |
+
"error": f"Text-generation fallback failed: {str(e2)}",
|
| 280 |
+
"traceback": traceback.format_exc(),
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
# Parse JSON robustly
|
| 284 |
+
try:
|
| 285 |
json_text = _extract_first_json_object(raw)
|
| 286 |
extracted_data = json.loads(json_text)
|
|
|
|
| 287 |
return {
|
| 288 |
"success": True,
|
| 289 |
"data": extracted_data,
|
| 290 |
"raw_response": raw,
|
| 291 |
"model": model_id,
|
| 292 |
}
|
| 293 |
+
except json.JSONDecodeError as je:
|
|
|
|
| 294 |
return {
|
| 295 |
"success": False,
|
| 296 |
+
"error": f"JSON parsing error: {str(je)}",
|
| 297 |
+
"raw_response": raw,
|
| 298 |
"suggestion": (
|
| 299 |
"Model returned non-JSON or malformed JSON. "
|
| 300 |
+
"Try another HF_MODEL (e.g., Qwen/Qwen2.5-7B-Instruct), or reduce max_new_tokens."
|
| 301 |
),
|
| 302 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
|
| 304 |
|
| 305 |
# ==============================================================
|
|
|
|
| 319 |
status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"
|
| 320 |
status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
|
| 321 |
|
|
|
|
| 322 |
txt = (processed_data.get("text_content") or "").strip()
|
| 323 |
if len(txt) < 30:
|
| 324 |
msg = (
|
|
|
|
| 334 |
if result.get("success"):
|
| 335 |
json_output = json.dumps(result["data"], indent=2)
|
| 336 |
status_msg += f"✅ Extraction successful! Model: {result.get('model')}\n"
|
|
|
|
| 337 |
display_text = "=== EXTRACTED DATA ===\n\n" + json_output
|
| 338 |
return status_msg, json_output, display_text
|
| 339 |
|
|
|
|
| 342 |
if "suggestion" in result:
|
| 343 |
error_msg += f"\n💡 {result['suggestion']}\n"
|
| 344 |
if "traceback" in result:
|
| 345 |
+
error_msg += f"\nDebug info:\n{result['traceback'][:1200]}\n"
|
| 346 |
|
| 347 |
raw_resp = result.get("raw_response", "No response")
|
| 348 |
+
return error_msg, "{}", f"Raw Response:\n{raw_resp[:2000]}"
|
| 349 |
|
| 350 |
except Exception as e:
|
| 351 |
+
error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:1200]}"
|
| 352 |
return error_msg, "{}", error_msg
|
| 353 |
|
| 354 |
|
| 355 |
# ==============================================================
|
| 356 |
+
# Gradio Interface
|
| 357 |
# ==============================================================
|
| 358 |
|
| 359 |
def create_interface():
|
| 360 |
+
with gr.Blocks(theme=gr.themes.Soft(), title="Shipping Document Data Extractor") as demo:
|
| 361 |
gr.Markdown("""
|
| 362 |
# 📄 Shipping Document Data Extractor
|
| 363 |
|
|
|
|
| 403 |
### 💡 Notes
|
| 404 |
- For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
|
| 405 |
- For better throughput, set **HF_TOKEN** in Space Secrets.
|
| 406 |
+
- Switch models by setting **HF_MODEL** (e.g., `Qwen/Qwen2.5-7B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`).
|
| 407 |
""")
|
| 408 |
|
| 409 |
submit_btn.click(
|