Spaces:

MLBench
/

logistics_ocr

Sleeping

App Files Files Community

mlbench123 committed on Nov 11, 2025

Commit

e128ae3

verified ·

1 Parent(s): 209097b

Create app.py

Browse files

Files changed (1) hide show

app.py +362 -0

app.py ADDED Viewed

	@@ -0,0 +1,362 @@

+import gradio as gr
+import json
+import os
+from pathlib import Path
+from typing import List, Dict, Any
+import google.generativeai as genai
+from PIL import Image
+import PyPDF2
+import tempfile
+import traceback
# --------------------------------------------------------------
# Prompt sent to Gemini ahead of the document text and uploads.
# It is kept in three pieces (instructions, target JSON schema,
# extraction rules) that are concatenated into EXTRACTION_PROMPT.
# --------------------------------------------------------------
_PROMPT_INSTRUCTIONS = """You are an expert shipping-document data extractor with OCR capabilities.
Carefully analyze ALL text content from PDFs, images, and documents.
CRITICAL: Look at both the text AND the visual layout of documents. Sometimes important data
is in tables, handwritten notes, stamps, or poorly scanned areas.
Extract and structure the data as valid JSON only (no markdown, no commentary):
"""

# Shape of the JSON object the model is asked to return.
_PROMPT_SCHEMA = """{
  "poNumber": string | null,
  "shipFrom": string | null,
  "carrierType": string | null,
  "originCarrier": string | null,
  "railCarNumber": string | null,
  "totalQuantity": number | null,
  "totalUnits": string | null,
  "attachments": [string],
  "accountName": string | null,
  "inventories": {
    "items": [
      {
        "quantityShipped": number | null,
        "inventoryUnits": string | null,
        "pcs": number | null,
        "productName": string | null,
        "productCode": string | null,
        "product": {
          "category": string | null,
          "defaultUnits": string | null,
          "unit": number | null,
          "pcs": number | null,
          "mbf": number | null,
          "sf": number | null,
          "pcsHeight": number | null,
          "pcsWidth": number | null,
          "pcsLength": number | null
        },
        "customFields": [string]
      }
    ]
  }
}
"""

# Field-level parsing rules appended after the schema.
_PROMPT_RULES = """EXTRACTION RULES:
1. Extract ALL product line items - create one inventory item per product
2. Parse dimensions: "2X6X14" → pcsHeight=2, pcsWidth=6, pcsLength=14
3. Convert BF to MBF: BF ÷ 1000
4. customFields format: "Key||Value" (e.g., "Mill||Tolko")
5. Look for: PO numbers, shipping info, quantities, product codes, dimensions
6. Check headers, footers, stamps, handwritten notes, and table cells
7. If multiple documents, consolidate all items into one JSON
8. Return null for missing fields
9. attachments should list all provided filenames
Return ONLY valid JSON matching this exact structure."""

EXTRACTION_PROMPT = _PROMPT_INSTRUCTIONS + _PROMPT_SCHEMA + _PROMPT_RULES
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text layer of a PDF as one labelled section per page.

    Every page that yields text contributes a "--- Page N ---" header
    followed by that page's text. A page whose extraction raises is replaced
    by an inline error marker, so a single damaged page does not discard the
    text recovered from the remaining pages.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page sections; the string
        "No text extracted from PDF" when no page produced any text; or
        "Error extracting PDF text: <reason>" when the file cannot be opened
        or parsed at all. Failures are reported in the returned string and
        never raised to the caller.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            sections = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                try:
                    page_text = page.extract_text()
                except Exception as page_error:
                    # Keep going: the other pages may still be readable.
                    sections.append(
                        f"\n--- Page {page_num} ---\n"
                        f"[Error extracting page text: {page_error}]"
                    )
                    continue
                if page_text:
                    sections.append(f"\n--- Page {page_num} ---\n{page_text}")
            text = "".join(sections)
            return text if text.strip() else "No text extracted from PDF"
    except Exception as e:
        # Best-effort contract: callers embed the message in the prompt text.
        return f"Error extracting PDF text: {str(e)}"
def process_files_for_gemini(files: List[str]) -> Dict[str, Any]:
    """Read the given files and collect what the Gemini request needs.

    Paths that do not exist are skipped entirely. Every existing file is
    recorded in "attachments" and "file_info", then handled by extension:

    * ".pdf": text is extracted locally and the file is uploaded via genai.
    * image extensions: the file is uploaded via genai and a placeholder
      note is added to the text.
    * ".txt" / ".csv": the file content is read as UTF-8 (undecodable bytes
      are ignored).
    * ".doc" / ".docx": paragraph text is read with python-docx when that
      package is importable.
    * anything else: a note saying the type is unsupported is added, so the
      file no longer appears as an attachment with no trace in the text.

    Errors while handling one file are written into the text for that file
    and do not stop the remaining files from being processed.

    Args:
        files: Paths of the files to process; None or an empty list is
            treated as "no files".

    Returns:
        A dict with keys "text_content" (str), "file_objects" (list of
        objects returned by genai.upload_file), "attachments" (list of file
        names) and "file_info" (list of "File: <name> (Type: <ext>)").
    """
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}
    text_exts = {'.txt', '.csv'}
    word_exts = {'.doc', '.docx'}

    processed_data = {
        "text_content": "",
        "file_objects": [],
        "attachments": [],
        "file_info": []
    }
    # Text fragments are gathered here and joined once at the end.
    sections = []

    for file_path in files or []:
        if not os.path.exists(file_path):
            continue
        path = Path(file_path)
        file_name = path.name
        file_ext = path.suffix.lower()
        processed_data["attachments"].append(file_name)
        processed_data["file_info"].append(f"File: {file_name} (Type: {file_ext})")
        header = f"\n\n=== {file_name} ===\n"
        try:
            if file_ext == '.pdf':
                # Text first, so it survives even if the upload below fails.
                sections.append(header + extract_text_from_pdf(file_path))
                processed_data["file_objects"].append(genai.upload_file(file_path))
            elif file_ext in image_exts:
                processed_data["file_objects"].append(genai.upload_file(file_path))
                sections.append(
                    f"\n\n=== {file_name} (Image) ===\n[Image uploaded for visual analysis]"
                )
            elif file_ext in text_exts:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    sections.append(header + f.read())
            elif file_ext in word_exts:
                try:
                    import docx
                    doc = docx.Document(file_path)
                    text = "\n".join(para.text for para in doc.paragraphs)
                    sections.append(header + text)
                except ImportError:
                    sections.append(
                        header + "[Word document - install python-docx for text extraction]"
                    )
                except Exception as e:
                    sections.append(f"{header}Error reading Word doc: {str(e)}")
            else:
                # Previously these files were listed but silently ignored.
                sections.append(
                    f"{header}[Unsupported file type '{file_ext}' - content not processed]"
                )
        except Exception as e:
            sections.append(f"{header}Error processing: {str(e)}")

    processed_data["text_content"] = "".join(sections)
    return processed_data
def _strip_code_fences(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop an optional language tag ("json", "JSON", ...) after the fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def extract_with_gemini(processed_data: Dict[str, Any], api_key: str, model_name: str = "gemini-2.0-flash-exp") -> Dict[str, Any]:
    """Ask Gemini to turn the prepared documents into structured JSON.

    The request consists of EXTRACTION_PROMPT, the collected text, the list
    of attachment names and every uploaded file object in *processed_data*.
    The reply is stripped of Markdown code fences and parsed as JSON; if that
    fails and the reply contains a "{...}" span, the outermost span is parsed
    instead so that stray commentary around the JSON is tolerated.

    Args:
        processed_data: Dict providing "text_content", "attachments" and
            "file_objects".
        api_key: Gemini API key; None, empty or whitespace-only is rejected.
        model_name: Name of the Gemini model to use.

    Returns:
        On success: {"success": True, "data", "raw_response",
        "files_processed"}. On failure: {"success": False, "error"} plus
        "raw_response" and "suggestion" for JSON parsing errors, or
        "traceback" for any other error. Exceptions are never raised.
    """
    if not api_key or not api_key.strip():
        return {
            "success": False,
            "error": "Gemini API key not provided"
        }
    # Filled in once the model has answered; used when JSON parsing fails.
    raw_text = "No response"
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name)
        # Build the multimodal prompt: instructions, text, names, then files.
        content_parts = [
            EXTRACTION_PROMPT,
            f"\n\nDOCUMENT CONTEXT:\n{processed_data['text_content']}\n",
            f"\nATTACHMENTS: {json.dumps(processed_data['attachments'])}\n",
            "\nNow analyze the uploaded files carefully (including visual content) and extract the data as JSON:"
        ]
        content_parts.extend(processed_data["file_objects"])
        # A low temperature is used for the extraction request.
        generation_config = genai.types.GenerationConfig(
            temperature=0.2,
            max_output_tokens=8000,
        )
        response = model.generate_content(
            content_parts,
            generation_config=generation_config
        )
        raw_text = response.text
        response_text = _strip_code_fences(raw_text)
        try:
            extracted_data = json.loads(response_text)
        except json.JSONDecodeError:
            # Retry on the outermost {...} span in case the model added
            # commentary around the JSON; re-raise when there is no such span.
            start = response_text.find("{")
            end = response_text.rfind("}")
            if start == -1 or end <= start:
                raise
            response_text = response_text[start:end + 1]
            extracted_data = json.loads(response_text)
        return {
            "success": True,
            "data": extracted_data,
            "raw_response": response_text,
            "files_processed": len(processed_data["file_objects"])
        }
    except json.JSONDecodeError as e:
        return {
            "success": False,
            "error": f"JSON parsing error: {str(e)}",
            "raw_response": raw_text,
            "suggestion": "The AI returned non-JSON text. Try again or check the raw response."
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Extraction error: {str(e)}",
            "traceback": traceback.format_exc()
        }
def process_documents(files, api_key, model_choice):
    """Gradio callback: run the full upload -> extract -> format pipeline.

    Args:
        files: Uploaded files from the Gradio File component; each entry is
            either an object with a ``name`` attribute (its path) or a path.
        api_key: Gemini API key entered by the user.
        model_choice: Name of the Gemini model selected in the UI.

    Returns:
        A 3-tuple ``(status_text, json_text, preview_text)``. On any failure
        ``json_text`` is ``"{}"`` and the other two fields describe the
        problem; exceptions are reported in the tuple rather than raised.
    """
    if not files:
        return "❌ Error: Please upload at least one file", "{}", "No files provided"
    if not api_key or not api_key.strip():
        return "❌ Error: Please enter your Gemini API key", "{}", "API key missing"
    # Surrounding whitespace from copy/paste is not part of the key.
    api_key = api_key.strip()
    try:
        file_paths = [f.name if hasattr(f, 'name') else f for f in files]
        status_msg = f"📄 Processing {len(file_paths)} file(s)...\n"
        # process_files_for_gemini uploads files through genai, so the key
        # must be configured BEFORE it runs. Previously configuration only
        # happened later inside extract_with_gemini, which meant uploads ran
        # without a key (or with the key left over from an earlier request).
        genai.configure(api_key=api_key)
        processed_data = process_files_for_gemini(file_paths)
        status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"
        status_msg += "🤖 Extracting data with Gemini AI...\n"
        result = extract_with_gemini(processed_data, api_key, model_choice)
        if result.get("success"):
            json_output = json.dumps(result["data"], indent=2)
            status_msg += f"✅ Extraction successful! Processed {result.get('files_processed', 0)} files.\n"
            display_text = "=== EXTRACTED DATA ===\n\n" + json_output
            return status_msg, json_output, display_text

        # Failure path: surface the error plus whatever diagnostics exist.
        error_msg = f"❌ Extraction failed:\n{result.get('error', 'Unknown error')}\n"
        if 'suggestion' in result:
            error_msg += f"\n💡 {result['suggestion']}\n"
        if 'traceback' in result:
            # Truncated so the status box stays readable.
            error_msg += f"\nDebug info:\n{result['traceback'][:500]}"
        raw_resp = result.get('raw_response', 'No response')
        return error_msg, "{}", f"Raw Response:\n{raw_resp[:1000]}"
    except Exception as e:
        error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:500]}"
        return error_msg, "{}", error_msg
+# ==============================================================
+# Gradio Interface
+# ==============================================================
def create_interface():
    """Build and return the Gradio Blocks UI for the extractor.

    Layout: a left column with the API key box, model dropdown, file upload
    and submit button; a right column with status, JSON output and preview.
    The submit button is wired to ``process_documents``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
        gr.Markdown("""
        # 📄 Shipping Document Data Extractor
        Upload PDFs, images, Word docs, or text files to extract structured shipping data using Google Gemini AI.
        **Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
        """)
        with gr.Row():
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(
                    label="🔑 Gemini API Key",
                    placeholder="Enter your Google Gemini API key (AIza...)",
                    type="password",
                    info="Get your key from https://aistudio.google.com/apikey"
                )
                model_choice = gr.Dropdown(
                    choices=["gemini-2.0-flash-exp", "gemini-1.5-pro", "gemini-1.5-flash"],
                    value="gemini-2.0-flash-exp",
                    label="Model Selection",
                    info="Latest model recommended for best results"
                )
                file_input = gr.File(
                    label="📎 Upload Documents",
                    file_count="multiple",
                    # ".webp" is included because process_files_for_gemini
                    # already treats it as an image type.
                    file_types=[
                        ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp",
                        ".txt", ".csv", ".doc", ".docx",
                    ]
                )
                submit_btn = gr.Button("🚀 Extract Data", variant="primary", size="lg")
            with gr.Column(scale=3):
                status_output = gr.Textbox(
                    label="📊 Status",
                    lines=4,
                    max_lines=8
                )
                json_output = gr.Code(
                    label="📋 JSON Output (Copy this)",
                    language="json",
                    lines=15
                )
                display_output = gr.Textbox(
                    label="👁️ Preview",
                    lines=10,
                    max_lines=15
                )
        gr.Markdown("""
        ### 💡 Tips:
        - Upload multiple files for batch processing
        - For images: ensure text is clear and well-lit
        - For PDFs: both text-based and scanned PDFs work
        - The AI will analyze visual content even if text extraction fails
        """)
        # Button action
        submit_btn.click(
            fn=process_documents,
            inputs=[file_input, api_key_input, model_choice],
            outputs=[status_output, json_output, display_output]
        )
        # The example row refers to a file on disk. Register it only when the
        # file is really there, so the UI is never handed a dangling path.
        example_file = "example1.pdf"
        if os.path.exists(example_file):
            gr.Examples(
                examples=[
                    [[example_file], "your-api-key-here"],
                ],
                inputs=[file_input, api_key_input],
                label="Example Usage"
            )
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7860 without requesting a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo = create_interface()
    demo.launch(**launch_options)