Update app.py
app.py
CHANGED
@@ -2,25 +2,25 @@ import gradio as gr
 import json
 import os
 from pathlib import Path
-from typing import List, Dict, Any
-import google.generativeai as genai
+from typing import List, Dict, Any, Optional, Tuple
+import traceback
+
 from PIL import Image
 import PyPDF2
-
-
+
+# Open-source OCR + PDF rendering
+import pytesseract
+from pdf2image import convert_from_path
+
+# Open-source model inference via Hugging Face
+from huggingface_hub import InferenceClient
+
 
 # ==============================================================
-#
-# ==============================================================
-GEMINI_API_KEY = "AIzaSyDbIO57s0DlXMXRoKHKKrJNUcKytwbee-g"
-# ==============================================================
-# Enhanced extraction prompt with better instructions
+# Extraction prompt (same schema you used; updated wording for OCR-first)
 # ==============================================================
-EXTRACTION_PROMPT = """You are an expert shipping-document data extractor
-
-
-CRITICAL: Look at both the text AND the visual layout of documents. Sometimes important data
-is in tables, handwritten notes, stamps, or poorly scanned areas.
+EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
+You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
 
 Extract and structure the data as valid JSON only (no markdown, no commentary):
 
@@ -60,238 +60,297 @@ Extract and structure the data as valid JSON only (no markdown, no commentary):
 }
 
 EXTRACTION RULES:
-1. Extract ALL product line items - create one inventory item per product
-2. Parse dimensions: "2X6X14" → pcsHeight=2, pcsWidth=6, pcsLength=14
-3. BF = totalQuantity
+1. Extract ALL product line items - create one inventory item per product line
+2. Parse dimensions: "2X6X14" → pcsHeight=2, pcsWidth=6, pcsLength=14 (numbers only)
+3. BF = totalQuantity (if total board-feet is present)
 4. Convert BF to MBF: BF ÷ 1000
 5. customFields format: "Key||Value" (e.g., "Mill||Tolko")
 6. Look for: PO numbers, shipping info, quantities, product codes, dimensions
-7.
-8.
-9.
-10. attachments should list all provided filenames
+7. If multiple documents, consolidate all items into one JSON
+8. Return null for missing fields
+9. attachments should list all provided filenames
 
 Return ONLY valid JSON matching this exact structure."""
 
 
+# ==============================================================
+# Utilities: JSON extraction/cleaning
+# ==============================================================
+
+def _strip_code_fences(s: str) -> str:
+    s = s.strip()
+    if s.startswith("```"):
+        # remove opening fence line
+        parts = s.split("\n", 1)
+        if len(parts) == 2:
+            s = parts[1]
+    if s.endswith("```"):
+        s = s[:-3]
+    return s.strip()
+
+def _extract_first_json_object(s: str) -> str:
+    """
+    Attempts to pull the first valid JSON object from a model response,
+    even if extra text exists before/after.
+    """
+    s = _strip_code_fences(s)
+
+    # Heuristic: find first '{' and last '}' (outermost object)
+    start = s.find("{")
+    end = s.rfind("}")
+    if start == -1 or end == -1 or end <= start:
+        raise json.JSONDecodeError("No JSON object found in response", s, 0)
+    return s[start:end + 1].strip()
+
+
+# ==============================================================
+# Text extraction: PDFs, images, docs
+# ==============================================================
+
 def extract_text_from_pdf(pdf_path: str) -> str:
-    """Extract text from PDF"""
+    """Extract embedded text from PDF (works for text-based PDFs)."""
     try:
-        with open(pdf_path, 'rb') as file:
+        with open(pdf_path, "rb") as file:
             pdf_reader = PyPDF2.PdfReader(file)
             text = ""
             for page_num, page in enumerate(pdf_reader.pages):
                 page_text = page.extract_text()
                 if page_text:
-                    text += f"\n--- Page {page_num + 1} ---\n{page_text}"
-        return text
+                    text += f"\n--- Page {page_num + 1} (PDF text) ---\n{page_text}"
+        return text.strip()
     except Exception as e:
         return f"Error extracting PDF text: {str(e)}"
 
+def ocr_image(image: Image.Image) -> str:
+    """OCR a PIL image using Tesseract."""
+    try:
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        return pytesseract.image_to_string(image)
+    except Exception as e:
+        return f"Error performing OCR on image: {str(e)}"
+
+def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
+    """
+    Extract text from PDF:
+    1) Try embedded text via PyPDF2
+    2) If empty/insufficient, render pages and OCR
+    """
+    embedded = extract_text_from_pdf(pdf_path)
+    # Consider embedded extraction "good" if it has meaningful length
+    if embedded and len(embedded) >= 50 and "Error extracting PDF text" not in embedded:
+        return embedded
+
+    # OCR fallback for scanned PDFs
+    try:
+        pages = convert_from_path(pdf_path, dpi=dpi)
+        ocr_chunks = []
+        for i, page_img in enumerate(pages):
+            page_text = ocr_image(page_img)
+            ocr_chunks.append(f"\n--- Page {i+1} (OCR) ---\n{page_text}")
+        merged = "\n".join(ocr_chunks).strip()
+        return merged if merged else (embedded or "No text extracted from PDF (OCR empty)")
+    except Exception as e:
+        # If poppler isn't installed, this will fail; surface clear error
+        msg = (
+            f"Error rendering PDF for OCR: {str(e)}\n"
+            f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
+        )
+        return msg
+
+def extract_text_from_docx(docx_path: str) -> str:
+    try:
+        import docx
+        doc = docx.Document(docx_path)
+        text = "\n".join([p.text for p in doc.paragraphs if p.text])
+        return text.strip()
+    except Exception as e:
+        return f"Error reading Word doc: {str(e)}"
+
 
-def
-    """Process files"""
+def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
+    """Process files locally (no Gemini upload)."""
     processed_data = {
         "text_content": "",
-        "file_objects": [],
         "attachments": [],
-        "file_info": []
+        "file_info": [],
     }
-
+
     if not files:
         return processed_data
-
+
     for file_path in files:
         if not os.path.exists(file_path):
             continue
-
+
         file_name = Path(file_path).name
         file_ext = Path(file_path).suffix.lower()
-
+
         processed_data["attachments"].append(file_name)
         processed_data["file_info"].append(f"File: {file_name} (Type: {file_ext})")
-
+
         try:
-
-
-                text = extract_text_from_pdf(file_path)
+            if file_ext == ".pdf":
+                text = extract_text_from_pdf_with_ocr(file_path)
                 processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
-
-
-
-
-
-
-            elif file_ext in [
-
-
-
-
-
-            # Handle text files
-            elif file_ext in ['.txt', '.csv']:
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
-                    text = f.read()
+
+            elif file_ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]:
+                img = Image.open(file_path)
+                text = ocr_image(img)
+                processed_data["text_content"] += f"\n\n=== {file_name} (OCR) ===\n{text}"
+
+            elif file_ext in [".txt", ".csv"]:
+                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                    processed_data["text_content"] += f"\n\n=== {file_name} ===\n{f.read()}"
+
+            elif file_ext in [".doc", ".docx"]:
+                text = extract_text_from_docx(file_path)
                 processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
-
-
-
-
-                import docx
-                doc = docx.Document(file_path)
-                text = "\n".join([para.text for para in doc.paragraphs])
-                processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
-            except ImportError:
-                processed_data["text_content"] += f"\n\n=== {file_name} ===\n[Word document - install python-docx for text extraction]"
-            except Exception as e:
-                processed_data["text_content"] += f"\n\n=== {file_name} ===\nError reading Word doc: {str(e)}"
-
+
+            else:
+                processed_data["text_content"] += f"\n\n=== {file_name} ===\n[Unsupported file type: {file_ext}]"
+
         except Exception as e:
             processed_data["text_content"] += f"\n\n=== {file_name} ===\nError processing: {str(e)}"
-
+
     return processed_data
 
 
-
-
-
-
-
-
-
-
+# ==============================================================
+# Open-source model extraction via Hugging Face Inference API
+# ==============================================================
+
+def extract_with_hf_llm(
+    processed_data: Dict[str, Any],
+    model_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    """
+    Uses Hugging Face Inference API for an open-source instruct model.
+    - Set HF_TOKEN as a Space Secret for better limits (optional).
+    - Optionally set HF_MODEL env var to change model without code edits.
+    """
     try:
-
-
-
-
-
-
-
-
-
-
-
-            "\
-        ]
-
-        # Add all uploaded files
-        content_parts.extend(processed_data["file_objects"])
-
-        # Generate with higher temperature for better extraction
-        generation_config = genai.types.GenerationConfig(
-            temperature=0.2,
-            max_output_tokens=8000,
+        hf_token = os.getenv("HF_TOKEN", "").strip() or None
+        model_id = model_id or (os.getenv("HF_MODEL", "").strip() or None) or "Qwen/Qwen2.5-7B-Instruct"
+
+        client = InferenceClient(model=model_id, token=hf_token)
+
+        prompt = (
+            EXTRACTION_PROMPT
+            + "\n\nDOCUMENT TEXT (OCR + extracted text):\n"
+            + processed_data.get("text_content", "")
+            + "\n\nATTACHMENTS:\n"
+            + json.dumps(processed_data.get("attachments", []))
+            + "\n\nReturn ONLY valid JSON."
         )
-
-
-
-
+
+        resp = client.chat_completion(
+            messages=[
+                {"role": "system", "content": "You extract structured data and return strict JSON only."},
+                {"role": "user", "content": prompt},
+            ],
+            temperature=0.1,
+            max_tokens=3000,
         )
-
-
-
-
-
-
-
-            response_text = response_text[3:]
-        if response_text.endswith("```"):
-            response_text = response_text[:-3]
-
-        response_text = response_text.strip()
-
-        # Parse JSON
-        extracted_data = json.loads(response_text)
-
+
+        raw = resp.choices[0].message.content if resp and resp.choices else ""
+        raw = (raw or "").strip()
+
+        json_text = _extract_first_json_object(raw)
+        extracted_data = json.loads(json_text)
+
         return {
             "success": True,
             "data": extracted_data,
-            "raw_response":
-            "
+            "raw_response": raw,
+            "model": model_id,
         }
-
+
     except json.JSONDecodeError as e:
         return {
             "success": False,
             "error": f"JSON parsing error: {str(e)}",
-            "raw_response":
-            "suggestion":
+            "raw_response": raw if "raw" in locals() else "",
+            "suggestion": (
+                "Model returned non-JSON or malformed JSON. "
+                "Try again or switch HF_MODEL to a different instruct model."
+            ),
         }
     except Exception as e:
         return {
             "success": False,
             "error": f"Extraction error: {str(e)}",
-            "traceback": traceback.format_exc()
+            "traceback": traceback.format_exc(),
         }
 
 
+# ==============================================================
+# Main Gradio function
+# ==============================================================
+
 def process_documents(files):
-    """Main Gradio processing function"""
-
     if not files or len(files) == 0:
         return "❌ Error: Please upload at least one file", "{}", "No files provided"
-
-    # Use the hardcoded API key and default model
-    api_key = GEMINI_API_KEY
-    model_choice = "gemini-2.0-flash"
-
-    if not api_key or api_key.strip() == "":
-        return "❌ Error: API key not configured in code", "{}", "API key missing"
-
+
     try:
-
-        file_paths = [f.name if hasattr(f, 'name') else f for f in files]
-
+        file_paths = [f.name if hasattr(f, "name") else f for f in files]
         status_msg = f"📄 Processing {len(file_paths)} file(s)...\n"
-
-        #
-        processed_data =
+
+        # Local extraction (PDF text + OCR)
+        processed_data = process_files_for_extraction(file_paths)
         status_msg += f"✅ Files loaded: {', '.join(processed_data['attachments'])}\n"
-
-
-
-
-
+        status_msg += "🧾 Extracting text (PDF text + OCR where needed)...\n"
+
+        # If we extracted basically nothing, fail early with guidance
+        txt = (processed_data.get("text_content") or "").strip()
+        if len(txt) < 30:
+            msg = (
+                "❌ No usable text could be extracted.\n"
+                "If PDFs are scanned, ensure OCR dependencies are installed (tesseract-ocr + poppler-utils).\n"
+            )
+            return msg, "{}", msg
+
+        # LLM structuring
+        status_msg += "🤖 Structuring to JSON with open-source model (HF Inference API)...\n"
+        result = extract_with_hf_llm(processed_data)
+
         if result.get("success"):
            json_output = json.dumps(result["data"], indent=2)
-            status_msg += f"✅ Extraction successful!\n"
-
-
-            display_text = "=== EXTRACTED DATA ===\n\n"
-            display_text += json_output
-
+            status_msg += f"✅ Extraction successful! Model: {result.get('model')}\n"
+
+            display_text = "=== EXTRACTED DATA ===\n\n" + json_output
             return status_msg, json_output, display_text
-
-
-
-
-
-
-
-
-
-
+
+        # Failure case
+        error_msg = f"❌ Extraction failed:\n{result.get('error', 'Unknown error')}\n"
+        if "suggestion" in result:
+            error_msg += f"\n💡 {result['suggestion']}\n"
+        if "traceback" in result:
+            error_msg += f"\nDebug info:\n{result['traceback'][:800]}\n"
+
+        raw_resp = result.get("raw_response", "No response")
+        return error_msg, "{}", f"Raw Response:\n{raw_resp[:1500]}"
+
     except Exception as e:
-        error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:
+        error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:800]}"
        return error_msg, "{}", error_msg
 
 
 # ==============================================================
-# Gradio Interface
+# Gradio Interface (kept essentially the same)
 # ==============================================================
 
 def create_interface():
     with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
         gr.Markdown("""
         # 📄 Shipping Document Data Extractor
-
-        Upload PDFs, images, Word docs, or text files to extract structured shipping data
-
+
+        Upload PDFs, images, Word docs, or text files to extract structured shipping data.
+
+        **Pipeline:** Local OCR/Text extraction → Open-source LLM (HF Inference API) → JSON
         **Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
         """)
-
+
         with gr.Row():
             with gr.Column(scale=2):
                 file_input = gr.File(
@@ -299,62 +358,56 @@ def create_interface():
                     file_count="multiple",
                     file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".csv", ".doc", ".docx"]
                 )
-
-                # Add example button here
+
                 gr.Markdown("**Try with example:**")
                 example_btn = gr.Button("📄 Load Example PDF", size="sm", variant="secondary")
-
+
                 submit_btn = gr.Button("🚀 Extract Data", variant="primary", size="lg")
-
+
             with gr.Column(scale=3):
                 status_output = gr.Textbox(
                     label="📊 Status",
                     lines=4,
                     max_lines=8
                 )
-
+
                 json_output = gr.Code(
                     label="📋 JSON Output (Copy this)",
                     language="json",
                     lines=15
                 )
-
+
                 display_output = gr.Textbox(
                     label="👁️ Preview",
                     lines=10,
                     max_lines=15
                 )
-
+
         gr.Markdown("""
-        ### 💡
-        -
-        - For
-        -
-        - The AI will analyze visual content even if text extraction fails
+        ### 💡 Notes
+        - For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
+        - For better throughput, set **HF_TOKEN** in Space Secrets.
+        - You can switch models by setting **HF_MODEL** (e.g., `mistralai/Mistral-7B-Instruct-v0.3`).
         """)
-
+
         submit_btn.click(
            fn=process_documents,
            inputs=[file_input],
            outputs=[status_output, json_output, display_output]
         )
-
+
         def load_example():
+            # In Spaces, example file should be in repo root
            example_path = "example1.pdf"
            if os.path.exists(example_path):
-                # Return list of file paths for multiple file input
                return [example_path]
-
-
-                print(f"Warning: Example file '{example_path}' not found")
-            return []
-
+            return []
+
         example_btn.click(
            fn=load_example,
            inputs=None,
            outputs=file_input
         )
-
 
         return demo
 
@@ -365,4 +418,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False
-    )
+    )
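
The new code path leans on system binaries (Tesseract for OCR, Poppler for PDF rendering) plus several Python packages that the diff itself does not show. A minimal sketch of the two Spaces dependency files this change appears to assume; the contents are inferred from the new imports and from the packages.txt hints in the comments, not taken from the actual repo:

    # packages.txt: apt-installed system packages (assumed)
    tesseract-ocr
    poppler-utils

    # requirements.txt: Python packages implied by the new imports (assumed)
    gradio
    Pillow
    PyPDF2
    pytesseract
    pdf2image
    huggingface_hub
    python-docx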
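With the Gemini-specific upload path gone, the pipeline can also be exercised outside Gradio. A rough smoke test, assuming this file is saved as app.py and that example1.pdf sits in the repo root (which load_example() already expects); the script name and import path are hypothetical:

    # smoke_test.py (hypothetical, not part of this commit)
    import json
    from app import process_files_for_extraction, extract_with_hf_llm

    processed = process_files_for_extraction(["example1.pdf"])
    print(processed["attachments"])        # ['example1.pdf']

    # Uses HF_TOKEN / HF_MODEL from the environment if they are set
    result = extract_with_hf_llm(processed)
    if result["success"]:
        print(json.dumps(result["data"], indent=2))
    else:
        print(result["error"])

Because process_files_for_extraction() routes PDFs through extract_text_from_pdf_with_ocr(), the same test covers scanned documents: OCR only runs when the embedded text comes back shorter than 50 characters.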
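The JSON-salvage helper is also worth a note: rather than trusting the model to obey the "JSON only" instruction, _extract_first_json_object() strips code fences and slices from the first "{" to the last "}". A quick illustration with a hypothetical messy response:

    from app import _extract_first_json_object

    messy = 'Sure, here you go:\n```json\n{"poNumber": "12345"}\n```\nLet me know!'
    print(_extract_first_json_object(messy))   # {"poNumber": "12345"}

When no braces are present it raises json.JSONDecodeError, which the JSONDecodeError branch in extract_with_hf_llm() then reports back with the raw response attached.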