mlbench123 committed on
Commit
13f5bf7
·
verified ·
1 Parent(s): a8b669f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -54
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import json
3
  import os
4
  from pathlib import Path
5
- from typing import List, Dict, Any, Optional, Tuple
6
  import traceback
7
 
8
  from PIL import Image
@@ -17,7 +17,7 @@ from huggingface_hub import InferenceClient
17
 
18
 
19
  # ==============================================================
20
- # Extraction prompt (same schema you used; updated wording for OCR-first)
21
  # ==============================================================
22
  EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
23
  You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
@@ -78,24 +78,25 @@ Return ONLY valid JSON matching this exact structure."""
78
  # ==============================================================
79
 
80
  def _strip_code_fences(s: str) -> str:
81
- s = s.strip()
82
  if s.startswith("```"):
83
- # remove opening fence line
84
  parts = s.split("\n", 1)
85
  if len(parts) == 2:
86
  s = parts[1]
 
 
87
  if s.endswith("```"):
88
  s = s[:-3]
89
  return s.strip()
90
 
 
91
  def _extract_first_json_object(s: str) -> str:
92
  """
93
- Attempts to pull the first valid JSON object from a model response,
94
- even if extra text exists before/after.
95
  """
96
  s = _strip_code_fences(s)
97
 
98
- # Heuristic: find first '{' and last '}' (outermost object)
99
  start = s.find("{")
100
  end = s.rfind("}")
101
  if start == -1 or end == -1 or end <= start:
@@ -121,6 +122,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
121
  except Exception as e:
122
  return f"Error extracting PDF text: {str(e)}"
123
 
 
124
  def ocr_image(image: Image.Image) -> str:
125
  """OCR a PIL image using Tesseract."""
126
  try:
@@ -130,6 +132,7 @@ def ocr_image(image: Image.Image) -> str:
130
  except Exception as e:
131
  return f"Error performing OCR on image: {str(e)}"
132
 
 
133
  def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
134
  """
135
  Extract text from PDF:
@@ -137,11 +140,9 @@ def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
137
  2) If empty/insufficient, render pages and OCR
138
  """
139
  embedded = extract_text_from_pdf(pdf_path)
140
- # Consider embedded extraction "good" if it has meaningful length
141
  if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
142
  return embedded
143
 
144
- # OCR fallback for scanned PDFs
145
  try:
146
  pages = convert_from_path(pdf_path, dpi=dpi)
147
  ocr_chunks = []
@@ -151,12 +152,11 @@ def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
151
  merged = "\n".join(ocr_chunks).strip()
152
  return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
153
  except Exception as e:
154
- # If poppler isn't installed, this will fail; surface clear error
155
- msg = (
156
  f"Error rendering PDF for OCR: {str(e)}\n"
157
  f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
158
  )
159
- return msg
160
 
161
  def extract_text_from_docx(docx_path: str) -> str:
162
  try:
@@ -169,7 +169,7 @@ def extract_text_from_docx(docx_path: str) -> str:
169
 
170
 
171
  def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
172
- """Process files locally (no Gemini upload)."""
173
  processed_data = {
174
  "text_content": "",
175
  "attachments": [],
@@ -218,32 +218,31 @@ def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
218
 
219
  # ==============================================================
220
  # Open-source model extraction via Hugging Face Inference API
 
 
221
  # ==============================================================
222
 
223
  def extract_with_hf_llm(
224
  processed_data: Dict[str, Any],
225
  model_id: Optional[str] = None,
226
  ) -> Dict[str, Any]:
227
- """
228
- Uses Hugging Face Inference API for an open-source instruct model.
229
- - Set HF_TOKEN as a Space Secret for better limits (optional).
230
- - Optionally set HF_MODEL env var to change model without code edits.
231
- """
232
- try:
233
- hf_token = os.getenv("HF_TOKEN", "").strip() or None
234
- model_id = model_id or (os.getenv("HF_MODEL", "").strip() or None) or "Qwen/Qwen2.5-7B-Instruct"
235
-
236
- client = InferenceClient(model=model_id, token=hf_token)
237
-
238
- prompt = (
239
- EXTRACTION_PROMPT
240
- + "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
241
- + processed_data.get("text_content", "")
242
- + "\n\nATTACHMENTS:\n"
243
- + json.dumps(processed_data.get("attachments", []))
244
- + "\n\nReturn ONLY valid JSON."
245
- )
246
 
 
 
 
247
  resp = client.chat_completion(
248
  messages=[
249
  {"role": "system", "content": "You extract structured data and return strict JSON only."},
@@ -252,36 +251,55 @@ def extract_with_hf_llm(
252
  temperature=0.1,
253
  max_tokens=3000,
254
  )
 
 
 
 
 
 
255
 
256
- raw = resp.choices[0].message.content if resp and resp.choices else ""
257
- raw = (raw or "").strip()
 
 
 
 
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  json_text = _extract_first_json_object(raw)
260
  extracted_data = json.loads(json_text)
261
-
262
  return {
263
  "success": True,
264
  "data": extracted_data,
265
  "raw_response": raw,
266
  "model": model_id,
267
  }
268
-
269
- except json.JSONDecodeError as e:
270
  return {
271
  "success": False,
272
- "error": f"JSON parsing error: {str(e)}",
273
- "raw_response": raw if "raw" in locals() else "",
274
  "suggestion": (
275
  "Model returned non-JSON or malformed JSON. "
276
- "Try again or switch HF_MODEL to a different instruct model."
277
  ),
278
  }
279
- except Exception as e:
280
- return {
281
- "success": False,
282
- "error": f"Extraction error: {str(e)}",
283
- "traceback": traceback.format_exc(),
284
- }
285
 
286
 
287
  # ==============================================================
@@ -301,7 +319,6 @@ def process_documents(files):
301
  status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"
302
  status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
303
 
304
- # If we extracted basically nothing, fail early with guidance
305
  txt = (processed_data.get("text_content") or "").strip()
306
  if len(txt) < 30:
307
  msg = (
@@ -317,7 +334,6 @@ def process_documents(files):
317
  if result.get("success"):
318
  json_output = json.dumps(result["data"], indent=2)
319
  status_msg += f"✅ Extraction successful! Model: {result.get('model')}\n"
320
-
321
  display_text = "=== EXTRACTED DATA ===\n\n" + json_output
322
  return status_msg, json_output, display_text
323
 
@@ -326,22 +342,22 @@ def process_documents(files):
326
  if "suggestion" in result:
327
  error_msg += f"\n💡 {result['suggestion']}\n"
328
  if "traceback" in result:
329
- error_msg += f"\nDebug info:\n{result['traceback'][:800]}\n"
330
 
331
  raw_resp = result.get("raw_response", "No response")
332
- return error_msg, "{}", f"Raw Response:\n{raw_resp[:1500]}"
333
 
334
  except Exception as e:
335
- error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:800]}"
336
  return error_msg, "{}", error_msg
337
 
338
 
339
  # ==============================================================
340
- # Gradio Interface (kept essentially the same)
341
  # ==============================================================
342
 
343
  def create_interface():
344
- with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
345
  gr.Markdown("""
346
  # 📄 Shipping Document Data Extractor
347
 
@@ -387,7 +403,7 @@ def create_interface():
387
  ### 💡 Notes
388
  - For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
389
  - For better throughput, set **HF_TOKEN** in Space Secrets.
390
- - You can switch models by setting **HF_MODEL** (e.g., `mistralai/Mistral-7B-Instruct-v0.3`).
391
  """)
392
 
393
  submit_btn.click(
 
2
  import json
3
  import os
4
  from pathlib import Path
5
+ from typing import List, Dict, Any, Optional
6
  import traceback
7
 
8
  from PIL import Image
 
17
 
18
 
19
  # ==============================================================
20
+ # Extraction prompt (JSON schema)
21
  # ==============================================================
22
  EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
23
  You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
 
78
  # ==============================================================
79
 
80
  def _strip_code_fences(s: str) -> str:
81
+ s = (s or "").strip()
82
  if s.startswith("```"):
83
+ # remove opening fence line (optionally "```json")
84
  parts = s.split("\n", 1)
85
  if len(parts) == 2:
86
  s = parts[1]
87
+ else:
88
+ s = s.replace("```", "", 1)
89
  if s.endswith("```"):
90
  s = s[:-3]
91
  return s.strip()
92
 
93
+
94
  def _extract_first_json_object(s: str) -> str:
95
  """
96
+ Pull the first JSON object from a model response, even if extra text exists.
 
97
  """
98
  s = _strip_code_fences(s)
99
 
 
100
  start = s.find("{")
101
  end = s.rfind("}")
102
  if start == -1 or end == -1 or end <= start:
 
122
  except Exception as e:
123
  return f"Error extracting PDF text: {str(e)}"
124
 
125
+
126
  def ocr_image(image: Image.Image) -> str:
127
  """OCR a PIL image using Tesseract."""
128
  try:
 
132
  except Exception as e:
133
  return f"Error performing OCR on image: {str(e)}"
134
 
135
+
136
  def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
137
  """
138
  Extract text from PDF:
 
140
  2) If empty/insufficient, render pages and OCR
141
  """
142
  embedded = extract_text_from_pdf(pdf_path)
 
143
  if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
144
  return embedded
145
 
 
146
  try:
147
  pages = convert_from_path(pdf_path, dpi=dpi)
148
  ocr_chunks = []
 
152
  merged = "\n".join(ocr_chunks).strip()
153
  return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
154
  except Exception as e:
155
+ return (
 
156
  f"Error rendering PDF for OCR: {str(e)}\n"
157
  f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
158
  )
159
+
160
 
161
  def extract_text_from_docx(docx_path: str) -> str:
162
  try:
 
169
 
170
 
171
  def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
172
+ """Process files locally (no Gemini)."""
173
  processed_data = {
174
  "text_content": "",
175
  "attachments": [],
 
218
 
219
  # ==============================================================
220
  # Open-source model extraction via Hugging Face Inference API
221
+ # - Tries chat endpoint
222
+ # - If model isn't chat-compatible, falls back to text generation endpoint
223
  # ==============================================================
224
 
225
  def extract_with_hf_llm(
226
  processed_data: Dict[str, Any],
227
  model_id: Optional[str] = None,
228
  ) -> Dict[str, Any]:
229
+ hf_token = os.getenv("HF_TOKEN", "").strip() or None
230
+ model_id = model_id or (os.getenv("HF_MODEL", "").strip() or None) or "Qwen/Qwen2.5-7B-Instruct"
231
+
232
+ client = InferenceClient(model=model_id, token=hf_token)
233
+
234
+ prompt = (
235
+ EXTRACTION_PROMPT
236
+ + "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
237
+ + (processed_data.get("text_content", "") or "")
238
+ + "\n\nATTACHMENTS:\n"
239
+ + json.dumps(processed_data.get("attachments", []))
240
+ + "\n\nReturn ONLY valid JSON."
241
+ )
 
 
 
 
 
 
242
 
243
+ raw = ""
244
+ try:
245
+ # Try chat-completions first (works for chat-enabled models)
246
  resp = client.chat_completion(
247
  messages=[
248
  {"role": "system", "content": "You extract structured data and return strict JSON only."},
 
251
  temperature=0.1,
252
  max_tokens=3000,
253
  )
254
+ raw = (resp.choices[0].message.content or "").strip()
255
+
256
+ except Exception as e:
257
+ # If model is not chat-compatible, fall back to text generation
258
+ msg = str(e)
259
+ is_not_chat = ("not a chat model" in msg.lower()) or ("model_not_supported" in msg.lower())
260
 
261
+ if not is_not_chat:
262
+ return {
263
+ "success": False,
264
+ "error": f"Extraction error: {msg}",
265
+ "traceback": traceback.format_exc(),
266
+ }
267
 
268
+ try:
269
+ gen = client.text_generation(
270
+ prompt,
271
+ temperature=0.1,
272
+ max_new_tokens=3000,
273
+ return_full_text=False,
274
+ )
275
+ raw = (gen or "").strip()
276
+ except Exception as e2:
277
+ return {
278
+ "success": False,
279
+ "error": f"Text-generation fallback failed: {str(e2)}",
280
+ "traceback": traceback.format_exc(),
281
+ }
282
+
283
+ # Parse JSON robustly
284
+ try:
285
  json_text = _extract_first_json_object(raw)
286
  extracted_data = json.loads(json_text)
 
287
  return {
288
  "success": True,
289
  "data": extracted_data,
290
  "raw_response": raw,
291
  "model": model_id,
292
  }
293
+ except json.JSONDecodeError as je:
 
294
  return {
295
  "success": False,
296
+ "error": f"JSON parsing error: {str(je)}",
297
+ "raw_response": raw,
298
  "suggestion": (
299
  "Model returned non-JSON or malformed JSON. "
300
+ "Try another HF_MODEL (e.g., Qwen/Qwen2.5-7B-Instruct), or reduce max_new_tokens."
301
  ),
302
  }
 
 
 
 
 
 
303
 
304
 
305
  # ==============================================================
 
319
  status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"
320
  status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
321
 
 
322
  txt = (processed_data.get("text_content") or "").strip()
323
  if len(txt) < 30:
324
  msg = (
 
334
  if result.get("success"):
335
  json_output = json.dumps(result["data"], indent=2)
336
  status_msg += f"✅ Extraction successful! Model: {result.get('model')}\n"
 
337
  display_text = "=== EXTRACTED DATA ===\n\n" + json_output
338
  return status_msg, json_output, display_text
339
 
 
342
  if "suggestion" in result:
343
  error_msg += f"\n💡 {result['suggestion']}\n"
344
  if "traceback" in result:
345
+ error_msg += f"\nDebug info:\n{result['traceback'][:1200]}\n"
346
 
347
  raw_resp = result.get("raw_response", "No response")
348
+ return error_msg, "{}", f"Raw Response:\n{raw_resp[:2000]}"
349
 
350
  except Exception as e:
351
+ error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:1200]}"
352
  return error_msg, "{}", error_msg
353
 
354
 
355
  # ==============================================================
356
+ # Gradio Interface
357
  # ==============================================================
358
 
359
  def create_interface():
360
+ with gr.Blocks(theme=gr.themes.Soft(), title="Shipping Document Data Extractor") as demo:
361
  gr.Markdown("""
362
  # 📄 Shipping Document Data Extractor
363
 
 
403
  ### 💡 Notes
404
  - For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
405
  - For better throughput, set **HF_TOKEN** in Space Secrets.
406
+ - Switch models by setting **HF_MODEL** (e.g., `Qwen/Qwen2.5-7B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`).
407
  """)
408
 
409
  submit_btn.click(