mlbench123 committed on
Commit
e277539
Β·
verified Β·
1 Parent(s): 13f5bf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -296
app.py CHANGED
@@ -7,22 +7,19 @@ import traceback
7
 
8
  from PIL import Image
9
  import PyPDF2
10
-
11
- # Open-source OCR + PDF rendering
12
  import pytesseract
13
  from pdf2image import convert_from_path
14
-
15
- # Open-source model inference via Hugging Face
16
  from huggingface_hub import InferenceClient
17
 
18
 
19
  # ==============================================================
20
- # Extraction prompt (JSON schema)
21
  # ==============================================================
 
22
  EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
23
- You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
24
 
25
- Extract and structure the data as valid JSON only (no markdown, no commentary):
26
 
27
  {
28
  "poNumber": string | null,
@@ -59,379 +56,209 @@ Extract and structure the data as valid JSON only (no markdown, no commentary):
59
  }
60
  }
61
 
62
- EXTRACTION RULES:
63
- 1. Extract ALL product line items - create one inventory item per product line
64
- 2. Parse dimensions: "2X6X14" β†’ pcsHeight=2, pcsWidth=6, pcsLength=14 (numbers only)
65
- 3. BF = totalQuantity (if total board-feet is present)
66
- 4. Convert BF to MBF: BF Γ· 1000
67
- 5. customFields format: "Key||Value" (e.g., "Mill||Tolko")
68
- 6. Look for: PO numbers, shipping info, quantities, product codes, dimensions
69
- 7. If multiple documents, consolidate all items into one JSON
70
- 8. Return null for missing fields
71
- 9. attachments should list all provided filenames
72
-
73
- Return ONLY valid JSON matching this exact structure."""
74
 
75
 
76
  # ==============================================================
77
- # Utilities: JSON extraction/cleaning
78
  # ==============================================================
79
 
80
- def _strip_code_fences(s: str) -> str:
81
- s = (s or "").strip()
82
- if s.startswith("```"):
83
- # remove opening fence line (optionally "```json")
84
- parts = s.split("\n", 1)
85
- if len(parts) == 2:
86
- s = parts[1]
87
- else:
88
- s = s.replace("```", "", 1)
89
- if s.endswith("```"):
90
- s = s[:-3]
91
- return s.strip()
92
 
 
 
 
93
 
94
- def _extract_first_json_object(s: str) -> str:
95
- """
96
- Pull the first JSON object from a model response, even if extra text exists.
97
- """
98
- s = _strip_code_fences(s)
99
 
100
- start = s.find("{")
101
- end = s.rfind("}")
102
- if start == -1 or end == -1 or end <= start:
103
- raise json.JSONDecodeError("No JSON object found in response", s, 0)
104
- return s[start:end + 1].strip()
105
 
106
 
107
  # ==============================================================
108
- # Text extraction: PDFs, images, docs
109
  # ==============================================================
110
 
111
  def extract_text_from_pdf(pdf_path: str) -> str:
112
- """Extract embedded text from PDF (works for text-based PDFs)."""
113
  try:
114
- with open(pdf_path, "rb") as file:
115
- pdf_reader = PyPDF2.PdfReader(file)
116
  text = ""
117
- for page_num, page in enumerate(pdf_reader.pages):
118
- page_text = page.extract_text()
119
- if page_text:
120
- text += f"\n--- Page {page_num + 1} (PDF text) ---\n{page_text}"
121
- return text.strip()
122
  except Exception as e:
123
- return f"Error extracting PDF text: {str(e)}"
124
 
125
 
126
- def ocr_image(image: Image.Image) -> str:
127
- """OCR a PIL image using Tesseract."""
128
- try:
129
- if image.mode != "RGB":
130
- image = image.convert("RGB")
131
- return pytesseract.image_to_string(image)
132
- except Exception as e:
133
- return f"Error performing OCR on image: {str(e)}"
134
 
135
 
136
- def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
137
- """
138
- Extract text from PDF:
139
- 1) Try embedded text via PyPDF2
140
- 2) If empty/insufficient, render pages and OCR
141
- """
142
- embedded = extract_text_from_pdf(pdf_path)
143
- if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
144
- return embedded
145
 
146
- try:
147
- pages = convert_from_path(pdf_path, dpi=dpi)
148
- ocr_chunks = []
149
- for i, page_img in enumerate(pages):
150
- page_text = ocr_image(page_img)
151
- ocr_chunks.append(f"\n--- Page {i+1} (OCR) ---\n{page_text}")
152
- merged = "\n".join(ocr_chunks).strip()
153
- return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
154
- except Exception as e:
155
- return (
156
- f"Error rendering PDF for OCR: {str(e)}\n"
157
- f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
158
- )
159
 
 
 
 
 
160
 
161
- def extract_text_from_docx(docx_path: str) -> str:
162
- try:
163
- import docx
164
- doc = docx.Document(docx_path)
165
- text = "\n".join([p.text for p in doc.paragraphs if p.text])
166
- return text.strip()
167
- except Exception as e:
168
- return f"Error reading Word doc: {str(e)}"
169
 
170
 
171
- def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
172
- """Process files locally (no Gemini)."""
173
- processed_data = {
174
  "text_content": "",
175
- "attachments": [],
176
- "file_info": [],
177
  }
178
 
179
- if not files:
180
- return processed_data
 
181
 
182
- for file_path in files:
183
- if not os.path.exists(file_path):
184
- continue
185
 
186
- file_name = Path(file_path).name
187
- file_ext = Path(file_path).suffix.lower()
188
 
189
- processed_data["attachments"].append(file_name)
190
- processed_data["file_info"].append(f"File: {file_name} (Type: {file_ext})")
 
191
 
192
- try:
193
- if file_ext == ".pdf":
194
- text = extract_text_from_pdf_with_ocr(file_path)
195
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
196
 
197
- elif file_ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]:
198
- img = Image.open(file_path)
199
- text = ocr_image(img)
200
- processed_data["text_content"] += f"\n\n=== {file_name} (OCR) ===\n{text}"
201
 
202
- elif file_ext in [".txt", ".csv"]:
203
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
204
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n{f.read()}"
205
-
206
- elif file_ext in [".doc", ".docx"]:
207
- text = extract_text_from_docx(file_path)
208
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
209
-
210
- else:
211
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n[Unsupported file type: {file_ext}]"
212
 
213
- except Exception as e:
214
- processed_data["text_content"] += f"\n\n=== {file_name} ===\nError processing: {str(e)}"
215
 
216
- return processed_data
217
 
218
 
219
  # ==============================================================
220
- # Open-source model extraction via Hugging Face Inference API
221
- # - Tries chat endpoint
222
- # - If model isn't chat-compatible, falls back to text generation endpoint
223
  # ==============================================================
224
 
225
- def extract_with_hf_llm(
226
- processed_data: Dict[str, Any],
227
- model_id: Optional[str] = None,
228
- ) -> Dict[str, Any]:
229
- hf_token = os.getenv("HF_TOKEN", "").strip() or None
230
- model_id = model_id or (os.getenv("HF_MODEL", "").strip() or None) or "Qwen/Qwen2.5-7B-Instruct"
231
 
232
- client = InferenceClient(model=model_id, token=hf_token)
233
 
234
  prompt = (
235
  EXTRACTION_PROMPT
236
- + "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
237
- + (processed_data.get("text_content", "") or "")
238
  + "\n\nATTACHMENTS:\n"
239
- + json.dumps(processed_data.get("attachments", []))
240
- + "\n\nReturn ONLY valid JSON."
241
  )
242
 
243
  raw = ""
244
- try:
245
- # Try chat-completions first (works for chat-enabled models)
246
- resp = client.chat_completion(
247
- messages=[
248
- {"role": "system", "content": "You extract structured data and return strict JSON only."},
249
- {"role": "user", "content": prompt},
250
- ],
251
- temperature=0.1,
252
- max_tokens=3000,
253
- )
254
- raw = (resp.choices[0].message.content or "").strip()
255
 
256
- except Exception as e:
257
- # If model is not chat-compatible, fall back to text generation
258
- msg = str(e)
259
- is_not_chat = ("not a chat model" in msg.lower()) or ("model_not_supported" in msg.lower())
260
-
261
- if not is_not_chat:
262
- return {
263
- "success": False,
264
- "error": f"Extraction error: {msg}",
265
- "traceback": traceback.format_exc(),
266
  }
 
 
267
 
 
268
  try:
269
- gen = client.text_generation(
270
- prompt,
 
 
 
 
271
  temperature=0.1,
272
- max_new_tokens=3000,
273
- return_full_text=False,
274
  )
275
- raw = (gen or "").strip()
 
276
  except Exception as e2:
277
  return {
278
  "success": False,
279
- "error": f"Text-generation fallback failed: {str(e2)}",
280
- "traceback": traceback.format_exc(),
281
  }
282
 
283
- # Parse JSON robustly
284
  try:
285
- json_text = _extract_first_json_object(raw)
286
- extracted_data = json.loads(json_text)
287
  return {
288
  "success": True,
289
- "data": extracted_data,
290
- "raw_response": raw,
291
- "model": model_id,
292
  }
293
- except json.JSONDecodeError as je:
294
  return {
295
  "success": False,
296
- "error": f"JSON parsing error: {str(je)}",
297
- "raw_response": raw,
298
- "suggestion": (
299
- "Model returned non-JSON or malformed JSON. "
300
- "Try another HF_MODEL (e.g., Qwen/Qwen2.5-7B-Instruct), or reduce max_new_tokens."
301
- ),
302
  }
303
 
304
 
305
  # ==============================================================
306
- # Main Gradio function
307
  # ==============================================================
308
 
309
  def process_documents(files):
310
- if not files or len(files) == 0:
311
- return "❌ Error: Please upload at least one file", "{}", "No files provided"
312
-
313
- try:
314
- file_paths = [f.name if hasattr(f, "name") else f for f in files]
315
- status_msg = f"πŸ“„ Processing {len(file_paths)} file(s)...\n"
316
-
317
- # Local extraction (PDF text + OCR)
318
- processed_data = process_files_for_extraction(file_paths)
319
- status_msg += f"βœ“ Files loaded: {', '.join(processed_data['attachments'])}\n"
320
- status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
321
-
322
- txt = (processed_data.get("text_content") or "").strip()
323
- if len(txt) < 30:
324
- msg = (
325
- "❌ No usable text could be extracted.\n"
326
- "If PDFs are scanned, ensure OCR dependencies are installed (tesseract-ocr + poppler-utils).\n"
327
- )
328
- return msg, "{}", msg
329
 
330
- # LLM structuring
331
- status_msg += "πŸ€– Structuring to JSON with open-source model (HF Inference API)...\n"
332
- result = extract_with_hf_llm(processed_data)
333
 
334
- if result.get("success"):
335
- json_output = json.dumps(result["data"], indent=2)
336
- status_msg += f"βœ… Extraction successful! Model: {result.get('model')}\n"
337
- display_text = "=== EXTRACTED DATA ===\n\n" + json_output
338
- return status_msg, json_output, display_text
339
 
340
- # Failure case
341
- error_msg = f"❌ Extraction failed:\n{result.get('error', 'Unknown error')}\n"
342
- if "suggestion" in result:
343
- error_msg += f"\nπŸ’‘ {result['suggestion']}\n"
344
- if "traceback" in result:
345
- error_msg += f"\nDebug info:\n{result['traceback'][:1200]}\n"
346
 
347
- raw_resp = result.get("raw_response", "No response")
348
- return error_msg, "{}", f"Raw Response:\n{raw_resp[:2000]}"
 
349
 
350
- except Exception as e:
351
- error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:1200]}"
352
- return error_msg, "{}", error_msg
353
 
354
 
355
  # ==============================================================
356
- # Gradio Interface
357
  # ==============================================================
358
 
359
- def create_interface():
360
- with gr.Blocks(theme=gr.themes.Soft(), title="Shipping Document Data Extractor") as demo:
361
- gr.Markdown("""
362
- # πŸ“„ Shipping Document Data Extractor
363
-
364
- Upload PDFs, images, Word docs, or text files to extract structured shipping data.
365
-
366
- **Pipeline:** Local OCR/Text extraction β†’ Open-source LLM (HF Inference API) β†’ JSON
367
- **Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
368
- """)
369
-
370
- with gr.Row():
371
- with gr.Column(scale=2):
372
- file_input = gr.File(
373
- label="πŸ“Ž Upload Documents",
374
- file_count="multiple",
375
- file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".csv", ".doc", ".docx"]
376
- )
377
-
378
- gr.Markdown("**Try with example:**")
379
- example_btn = gr.Button("πŸ“„ Load Example PDF", size="sm", variant="secondary")
380
-
381
- submit_btn = gr.Button("πŸš€ Extract Data", variant="primary", size="lg")
382
-
383
- with gr.Column(scale=3):
384
- status_output = gr.Textbox(
385
- label="πŸ“Š Status",
386
- lines=4,
387
- max_lines=8
388
- )
389
-
390
- json_output = gr.Code(
391
- label="πŸ“‹ JSON Output (Copy this)",
392
- language="json",
393
- lines=15
394
- )
395
-
396
- display_output = gr.Textbox(
397
- label="πŸ‘οΈ Preview",
398
- lines=10,
399
- max_lines=15
400
- )
401
-
402
- gr.Markdown("""
403
- ### πŸ’‘ Notes
404
- - For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
405
- - For better throughput, set **HF_TOKEN** in Space Secrets.
406
- - Switch models by setting **HF_MODEL** (e.g., `Qwen/Qwen2.5-7B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`).
407
- """)
408
-
409
- submit_btn.click(
410
- fn=process_documents,
411
- inputs=[file_input],
412
- outputs=[status_output, json_output, display_output]
413
- )
414
 
415
- def load_example():
416
- # In Spaces, example file should be in repo root
417
- example_path = "example1.pdf"
418
- if os.path.exists(example_path):
419
- return [example_path]
420
- return []
421
-
422
- example_btn.click(
423
- fn=load_example,
424
- inputs=None,
425
- outputs=file_input
426
- )
427
-
428
- return demo
429
 
430
-
431
- if __name__ == "__main__":
432
- demo = create_interface()
433
- demo.launch(
434
- server_name="0.0.0.0",
435
- server_port=7860,
436
- share=False
437
  )
 
 
 
7
 
8
  from PIL import Image
9
  import PyPDF2
 
 
10
  import pytesseract
11
  from pdf2image import convert_from_path
 
 
12
  from huggingface_hub import InferenceClient
13
 
14
 
15
  # ==============================================================
16
+ # Extraction prompt
17
  # ==============================================================
18
+
19
  EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
20
+ You will be given OCR/text extracted from shipping documents.
21
 
22
+ Extract and return ONLY valid JSON matching this schema:
23
 
24
  {
25
  "poNumber": string | null,
 
56
  }
57
  }
58
 
59
+ Return ONLY JSON. No explanation.
60
+ """
 
 
 
 
 
 
 
 
 
 
61
 
62
 
63
  # ==============================================================
64
+ # JSON Helpers
65
  # ==============================================================
66
 
67
def extract_json(text: str) -> Dict:
    """Parse the first JSON object found in a model response.

    Strips Markdown code fences (``` / ```json) before locating the
    outermost ``{ ... }`` span, so commentary around the JSON is tolerated.

    Args:
        text: raw model output, possibly fenced or surrounded by prose.

    Returns:
        The parsed JSON object as a dict/list/etc.

    Raises:
        json.JSONDecodeError: if no JSON object can be located or parsed.
    """
    text = text.strip()

    # Drop the opening ```/```json fence line, then any remaining fences.
    if text.startswith("```"):
        text = text.split("\n", 1)[-1]
        text = text.replace("```", "").strip()

    start = text.find("{")
    end = text.rfind("}")

    # Also guard end <= start (e.g. "} ... {"), which would otherwise slice
    # an empty/garbage span and surface a confusing parse error.
    if start == -1 or end == -1 or end <= start:
        raise json.JSONDecodeError("No JSON found", text, 0)

    return json.loads(text[start:end + 1])
 
81
 
82
 
83
  # ==============================================================
84
+ # OCR + TEXT EXTRACTION
85
  # ==============================================================
86
 
87
def extract_text_from_pdf(pdf_path: str) -> str:
    """Pull the embedded text layer out of a PDF.

    Best-effort: returns the concatenated page text, or a
    ``"PDF text error: ..."`` string on any failure — never raises.
    """
    try:
        with open(pdf_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            page_texts = [page.extract_text() for page in reader.pages]
        return "".join(t + "\n" for t in page_texts if t)
    except Exception as e:
        return f"PDF text error: {e}"
99
 
100
 
101
def ocr_image(img: Image.Image) -> str:
    """OCR a PIL image with Tesseract, converting to RGB first if needed."""
    rgb = img if img.mode == "RGB" else img.convert("RGB")
    return pytesseract.image_to_string(rgb)
 
 
 
 
105
 
106
 
107
def extract_pdf_with_ocr(pdf_path: str) -> str:
    """Extract text from a PDF, preferring the embedded text layer.

    Falls back to rendering pages with pdf2image and OCR-ing them when the
    embedded layer is missing, too short, or extraction failed.

    Args:
        pdf_path: path to the PDF file.

    Returns:
        Extracted text, or an error string (best-effort, never raises).
    """
    text = extract_text_from_pdf(pdf_path)

    # Only trust the embedded layer when it is real text of a useful size.
    # extract_text_from_pdf signals failure with a "PDF text error:" string,
    # which must not be mistaken for document content.
    if text and len(text) > 50 and not text.startswith("PDF text error:"):
        return text

    try:
        pages = convert_from_path(pdf_path, dpi=250)
        ocr_text = "".join(ocr_image(p) + "\n" for p in pages)
        # If OCR produced nothing, fall back to whatever the text layer gave.
        return ocr_text if ocr_text.strip() else text
    except Exception as e:
        # Typically the poppler-utils or tesseract binaries are missing.
        return f"PDF OCR error: {e}"
 
 
 
 
 
 
 
119
 
120
 
121
def process_files(files: List[str]) -> Dict[str, Any]:
    """Extract text from a batch of uploaded files.

    Supports PDF (embedded text + OCR fallback), common image formats (OCR),
    plain text/CSV, and Word documents. Per-file failures are recorded inline
    in the output instead of aborting the whole batch.

    Args:
        files: list of local file paths.

    Returns:
        dict with:
            "text_content": all extracted text, one "=== name ===" section per file
            "attachments":  the processed file names
    """
    result: Dict[str, Any] = {
        "text_content": "",
        "attachments": []
    }

    for f in files:
        path = Path(f)
        name = path.name
        ext = path.suffix.lower()

        result["attachments"].append(name)

        try:
            if ext == ".pdf":
                text = extract_pdf_with_ocr(f)

            elif ext in (".jpg", ".jpeg", ".png", ".webp"):
                img = Image.open(f)
                text = ocr_image(img)

            elif ext in (".txt", ".csv"):
                # Context manager so the handle is closed promptly
                # (the previous open(...).read() leaked the file handle).
                with open(f, encoding="utf-8", errors="ignore") as fh:
                    text = fh.read()

            elif ext in (".doc", ".docx"):
                import docx  # optional dependency; imported lazily
                doc = docx.Document(f)
                text = "\n".join(p.text for p in doc.paragraphs)

            else:
                text = ""  # unsupported type: still listed as an attachment
        except Exception as e:
            # Record the failure in place of this file's text; keep going.
            text = f"Error processing file: {e}"

        result["text_content"] += f"\n\n=== {name} ===\n{text}"

    return result
154
 
155
 
156
  # ==============================================================
157
+ # HF MODEL CALL (Robust: conversational support)
 
 
158
  # ==============================================================
159
 
160
def extract_with_hf(processed_data: Dict[str, Any]) -> Dict[str, Any]:
    """Structure extracted document text into JSON via the HF Inference API.

    Tries the chat-completion endpoint first (the supported path for
    instruct models in current huggingface_hub), then falls back to plain
    text generation for models without a chat template.

    Args:
        processed_data: output of process_files() — must contain
            "text_content" (str) and "attachments" (list of names).

    Returns:
        On success: {"success": True, "data": <parsed JSON>, "raw": <model text>}
        On failure: {"success": False, "error": ..., ...} (plus "raw"/"traceback")
    """
    hf_token = os.getenv("HF_TOKEN")
    model = os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")

    client = InferenceClient(model=model, token=hf_token)

    prompt = (
        EXTRACTION_PROMPT
        + "\n\nDOCUMENT TEXT:\n"
        + processed_data["text_content"]
        + "\n\nATTACHMENTS:\n"
        + json.dumps(processed_data["attachments"])
    )

    raw = ""

    try:
        # Preferred path: chat completion.
        # NOTE(review): the previous client.conversational(...) call was
        # deprecated/removed from huggingface_hub and took keyword args,
        # not a positional dict, so it failed on every request.
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": "Return strict JSON only."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=3000
        )
        raw = resp.choices[0].message.content
    except Exception as e1:
        try:
            # Fallback for models that are not chat-compatible.
            raw = client.text_generation(
                prompt,
                temperature=0.1,
                max_new_tokens=3000,
                return_full_text=False,
            )
        except Exception as e2:
            return {
                "success": False,
                "error": f"Model call failed:\n{e1}\n\n{e2}",
                "traceback": traceback.format_exc()
            }

    try:
        parsed = extract_json(raw)
        return {
            "success": True,
            "data": parsed,
            "raw": raw
        }
    except Exception as je:
        return {
            "success": False,
            "error": f"JSON parse error: {je}",
            "raw": raw
        }
220
 
221
 
222
  # ==============================================================
223
+ # MAIN PROCESS
224
  # ==============================================================
225
 
226
def process_documents(files):
    """Gradio callback: uploaded files -> (status, json_string, preview).

    Never raises: every failure is reported through the status string so
    the UI always receives its three outputs.

    Args:
        files: list of Gradio file objects (with a .name temp path) or
            plain path strings; may be None/empty.

    Returns:
        (status_message, json_output, preview_text) — all strings.
    """
    if not files:
        return "❌ Upload file", "{}", ""

    try:
        # Gradio file objects expose .name (a temp path); strings pass through.
        paths = [f.name if hasattr(f, "name") else f for f in files]

        status = "πŸ“„ Extracting text...\n"
        processed = process_files(paths)

        status += "πŸ€– Calling HF model...\n"
        result = extract_with_hf(processed)

        if result["success"]:
            json_out = json.dumps(result["data"], indent=2)
            # Return the accumulated status (previously built but discarded).
            status += "βœ… Success"
            return status, json_out, json_out

        return f"❌ Extraction failed:\n{result['error']}", "{}", result.get("raw", "")
    except Exception as e:
        # Guard the whole pipeline so the Gradio callback never crashes.
        err = f"❌ Unexpected error: {e}\n{traceback.format_exc()[:1200]}"
        return err, "{}", err
 
 
243
 
244
 
245
  # ==============================================================
246
+ # UI
247
  # ==============================================================
248
 
249
# Build the Gradio app at module import time and launch it.
with gr.Blocks() as demo:
    # Title banner.
    gr.Markdown("# πŸ“„ Logistic OCR – Open Source Version")

    # Inputs: multiple files of any type, plus a single action button.
    file_input = gr.File(file_count="multiple")
    btn = gr.Button("πŸš€ Extract")
    # Outputs: human-readable status, copyable JSON, and a plain-text preview.
    status = gr.Textbox(label="Status")
    json_out = gr.Code(language="json")
    preview = gr.Textbox(label="Preview")

    # Wire the button to the extraction pipeline; process_documents returns
    # exactly three strings matching the three output components.
    btn.click(
        process_documents,
        inputs=file_input,
        outputs=[status, json_out, preview]
    )

# Bind on all interfaces, port 7860 (Hugging Face Spaces convention).
demo.launch(server_name="0.0.0.0", server_port=7860)