File size: 13,608 Bytes
e128ae3
 
 
 
 
 
 
 
 
 
 
b049dd1
 
 
7236651
e128ae3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
805b147
e128ae3
 
 
 
 
26db34a
e128ae3
26db34a
 
e128ae3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0af3d47
 
 
 
 
 
 
 
e128ae3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154b160
e128ae3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1671cbf
e128ae3
 
 
 
 
1671cbf
b049dd1
48651d0
b049dd1
e128ae3
b049dd1
e128ae3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6fbf781
 
 
3fb3dfe
6fbf781
2b17a51
e128ae3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1671cbf
e128ae3
 
 
2b17a51
9a8025e
 
 
 
 
 
 
 
2b17a51
 
 
 
 
1671cbf
2b17a51
 
e128ae3
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
import gradio as gr
import json
import os
from pathlib import Path
from typing import List, Dict, Any
import google.generativeai as genai
from PIL import Image
import PyPDF2
import tempfile
import traceback

# ==============================================================
# API Configuration
# ==============================================================
# SECURITY NOTE: a hard-coded API key committed to source control is
# visible to anyone who can read the repository -- this key should be
# revoked and rotated.  The environment variable GEMINI_API_KEY takes
# precedence; the literal below remains only as a fallback so existing
# deployments keep working.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "AIzaSyAK2di4YWAGkO7nHcat7h0DuqNQeV7kH88")
# ==============================================================
# Extraction prompt
# ==============================================================
# Sent verbatim to Gemini ahead of the document text and file uploads.
# It instructs the model to answer with a single JSON object (no markdown
# fences, no commentary) matching the schema below; extract_with_gemini()
# parses the reply with json.loads, so any change to this string must keep
# the "JSON only" contract intact.
EXTRACTION_PROMPT = """You are an expert shipping-document data extractor with OCR capabilities.
Carefully analyze ALL text content from PDFs, images, and documents.

CRITICAL: Look at both the text AND the visual layout of documents. Sometimes important data 
is in tables, handwritten notes, stamps, or poorly scanned areas.

Extract and structure the data as valid JSON only (no markdown, no commentary):

{
  "poNumber": string | null,
  "shipFrom": string | null,
  "carrierType": string | null,
  "originCarrier": string | null,
  "railCarNumber": string | null,
  "totalQuantity": number | null,
  "totalUnits": string | null,
  "attachments": [string],
  "accountName": string | null,
  "inventories": {
    "items": [
      {
        "quantityShipped": number | null,
        "inventoryUnits": string | null,
        "pcs": number | null,
        "productName": string | null,
        "productCode": string | null,
        "product": {
          "category": number | null,
          "defaultUnits": string | null,
          "unit": string | null,
          "pcs": number | null,
          "mbf": number | null,
          "sf": number | null,
          "pcsHeight": number | null,
          "pcsWidth": number | null,
          "pcsLength": number | null
        },
        "customFields": [string]
      }
    ]
  }
}

EXTRACTION RULES:
1. Extract ALL product line items - create one inventory item per product
2. Parse dimensions: "2X6X14" β†’ pcsHeight=2, pcsWidth=6, pcsLength=14
3. BF = totalQuantity
4. Convert BF to MBF: BF Γ· 1000
5. customFields format: "Key||Value" (e.g., "Mill||Tolko")
6. Look for: PO numbers, shipping info, quantities, product codes, dimensions
7. Check headers, footers, stamps, handwritten notes, and table cells
8. If multiple documents, consolidate all items into one JSON
9. Return null for missing fields
10.attachments should list all provided filenames

Return ONLY valid JSON matching this exact structure."""


def extract_text_from_pdf(pdf_path: str) -> str:
    """Pull the text layer out of every page of a PDF.

    Pages are concatenated with "--- Page N ---" separators.  On any
    failure -- or a scanned PDF with no extractable text -- a
    human-readable message is returned instead of raising, so callers
    can keep processing the remaining files.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            chunks = []
            for number, page in enumerate(reader.pages, start=1):
                content = page.extract_text()
                if content:
                    chunks.append(f"\n--- Page {number} ---\n{content}")
            combined = "".join(chunks)
            return combined if combined.strip() else "No text extracted from PDF"
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"


def process_files_for_gemini(files: List[str]) -> Dict[str, Any]:
    """Load every uploaded file and stage it for the Gemini request.

    Returns a dict with:
      text_content  -- extracted text from all files, tagged per filename
      file_objects  -- genai upload handles for PDFs/images (visual analysis)
      attachments   -- bare filenames of every file that existed on disk
      file_info     -- human-readable "File: name (Type: ext)" strings
    Per-file failures are recorded inside text_content rather than raised,
    so one bad file never aborts the batch.
    """
    staged: Dict[str, Any] = {
        "text_content": "",
        "file_objects": [],
        "attachments": [],
        "file_info": [],
    }

    for path in files or []:
        if not os.path.exists(path):
            continue

        name = Path(path).name
        ext = Path(path).suffix.lower()

        staged["attachments"].append(name)
        staged["file_info"].append(f"File: {name} (Type: {ext})")

        try:
            if ext == '.pdf':
                # Text layer goes into the prompt; the raw file is also
                # uploaded so Gemini can inspect the page images.
                staged["text_content"] += f"\n\n=== {name} ===\n{extract_text_from_pdf(path)}"
                staged["file_objects"].append(genai.upload_file(path))

            elif ext in ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'):
                # Images carry no text layer; rely entirely on vision.
                staged["file_objects"].append(genai.upload_file(path))
                staged["text_content"] += f"\n\n=== {name} (Image) ===\n[Image uploaded for visual analysis]"

            elif ext in ('.txt', '.csv'):
                with open(path, 'r', encoding='utf-8', errors='ignore') as fh:
                    staged["text_content"] += f"\n\n=== {name} ===\n{fh.read()}"

            elif ext in ('.doc', '.docx'):
                # python-docx is optional; degrade to a placeholder note.
                try:
                    import docx
                    document = docx.Document(path)
                    body = "\n".join(para.text for para in document.paragraphs)
                    staged["text_content"] += f"\n\n=== {name} ===\n{body}"
                except ImportError:
                    staged["text_content"] += f"\n\n=== {name} ===\n[Word document - install python-docx for text extraction]"
                except Exception as e:
                    staged["text_content"] += f"\n\n=== {name} ===\nError reading Word doc: {str(e)}"

        except Exception as e:
            staged["text_content"] += f"\n\n=== {name} ===\nError processing: {str(e)}"

    return staged


def extract_with_gemini(processed_data: Dict[str, Any], api_key: str, model_name: str = "gemini-2.0-flash") -> Dict[str, Any]:
    """Run the multimodal extraction prompt through Gemini and parse the JSON reply.

    processed_data is the dict produced by process_files_for_gemini().
    Returns {"success": True, "data": ..., "raw_response": ...,
    "files_processed": ...} on success, or {"success": False, "error": ...}
    with diagnostic extras on failure.  Never raises.
    """
    if not api_key or api_key.strip() == "":
        return {
            "success": False,
            "error": "Gemini API key not provided"
        }

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name)

        # Order matters: instructions, extracted text, attachment names,
        # final directive, then the raw file uploads for visual analysis.
        parts = [
            EXTRACTION_PROMPT,
            f"\n\nDOCUMENT CONTEXT:\n{processed_data['text_content']}\n",
            f"\nATTACHMENTS: {json.dumps(processed_data['attachments'])}\n",
            "\nNow analyze the uploaded files carefully (including visual content) and extract the data as JSON:"
        ]
        parts.extend(processed_data["file_objects"])

        # Low temperature keeps the output close to deterministic JSON.
        config = genai.types.GenerationConfig(
            temperature=0.2,
            max_output_tokens=8000,
        )
        response = model.generate_content(parts, generation_config=config)

        body = response.text.strip()

        # The model sometimes wraps its answer in a markdown code fence
        # despite instructions; strip it before parsing.
        if body.startswith("```json"):
            body = body[7:]
        elif body.startswith("```"):
            body = body[3:]
        if body.endswith("```"):
            body = body[:-3]
        body = body.strip()

        return {
            "success": True,
            "data": json.loads(body),
            "raw_response": body,
            "files_processed": len(processed_data["file_objects"])
        }

    except json.JSONDecodeError as e:
        return {
            "success": False,
            "error": f"JSON parsing error: {str(e)}",
            "raw_response": response.text if 'response' in locals() else "No response",
            "suggestion": "The AI returned non-JSON text. Try again or check the raw response."
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Extraction error: {str(e)}",
            "traceback": traceback.format_exc()
        }


def process_documents(files):
    """Gradio callback: push the uploaded files through Gemini extraction.

    Returns a (status_message, json_string, display_text) tuple for the
    three output widgets.  Every failure path is reported through those
    strings; nothing propagates to Gradio as an exception.
    """
    if not files or len(files) == 0:
        return "❌ Error: Please upload at least one file", "{}", "No files provided"

    # Key and model come from module-level configuration.
    api_key = GEMINI_API_KEY
    model_choice = "gemini-2.0-flash"

    if not api_key or api_key.strip() == "":
        return "❌ Error: API key not configured in code", "{}", "API key missing"

    try:
        # Gradio may hand over file objects (with .name) or plain paths.
        paths = [item.name if hasattr(item, 'name') else item for item in files]

        status = f"πŸ“„ Processing {len(paths)} file(s)...\n"

        prepared = process_files_for_gemini(paths)
        status += f"βœ“ Files loaded: {', '.join(prepared['attachments'])}\n"

        status += "πŸ€– Extracting data with Gemini AI...\n"
        outcome = extract_with_gemini(prepared, api_key, model_choice)

        if not outcome.get("success"):
            # Surface the error plus any diagnostics the extractor attached.
            message = f"❌ Extraction failed:\n{outcome.get('error', 'Unknown error')}\n"
            if 'suggestion' in outcome:
                message += f"\nπŸ’‘ {outcome['suggestion']}\n"
            if 'traceback' in outcome:
                message += f"\nDebug info:\n{outcome['traceback'][:500]}"
            raw = outcome.get('raw_response', 'No response')
            return message, "{}", f"Raw Response:\n{raw[:1000]}"

        pretty = json.dumps(outcome["data"], indent=2)
        status += f"βœ… Extraction successful! Processed {outcome.get('files_processed', 0)} files.\n"
        preview = "=== EXTRACTED DATA ===\n\n" + pretty
        return status, pretty, preview

    except Exception as e:
        message = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:500]}"
        return message, "{}", message


# ==============================================================
# Gradio Interface
# ==============================================================

def create_interface():
    """Build the Gradio Blocks UI and wire the extraction callbacks.

    Layout: a left column with the multi-file uploader, an example-loader
    button, and the submit button; a right column with status, JSON, and
    preview outputs.  Returns the (unlaunched) gr.Blocks app.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
        gr.Markdown("""
        # πŸ“„ Shipping Document Data Extractor
        
        Upload PDFs, images, Word docs, or text files to extract structured shipping data using Google Gemini AI.
        
        **Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
        """)
        
        with gr.Row():
            with gr.Column(scale=2):
                # Accepts every extension process_files_for_gemini handles.
                file_input = gr.File(
                    label="πŸ“Ž Upload Documents",
                    file_count="multiple",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".csv", ".doc", ".docx"]
                )
                
                 # Example-loader button (pre-fills the uploader below).
                gr.Markdown("**Try with example:**")
                example_btn = gr.Button("πŸ“„ Load Example PDF", size="sm", variant="secondary")
            
                submit_btn = gr.Button("πŸš€ Extract Data", variant="primary", size="lg")
                     
            with gr.Column(scale=3):
                # Three outputs matching process_documents' return tuple.
                status_output = gr.Textbox(
                    label="πŸ“Š Status",
                    lines=4,
                    max_lines=8
                )
                
                json_output = gr.Code(
                    label="πŸ“‹ JSON Output (Copy this)",
                    language="json",
                    lines=15
                )
                
                display_output = gr.Textbox(
                    label="πŸ‘οΈ Preview",
                    lines=10,
                    max_lines=15
                )
        
        gr.Markdown("""
        ### πŸ’‘ Tips:
        - Upload multiple files for batch processing
        - For images: ensure text is clear and well-lit
        - For PDFs: both text-based and scanned PDFs work
        - The AI will analyze visual content even if text extraction fails
        """)
        
        # Main pipeline: uploads -> process_documents -> three outputs.
        submit_btn.click(
            fn=process_documents,
            inputs=[file_input],
            outputs=[status_output, json_output, display_output]
        )
        
        def load_example():
            """Return [example path] for the uploader, or [] if missing.

            Looks for example1.pdf in the current working directory.
            """
            example_path = "example1.pdf"
            if os.path.exists(example_path):
                # Return list of file paths for multiple file input
                return [example_path]
            else:
                # If example doesn't exist, return empty list
                print(f"Warning: Example file '{example_path}' not found")
                return []
        
        # Example button pre-populates the file uploader.
        example_btn.click(
            fn=load_example,
            inputs=None,
            outputs=file_input
        )
        

    return demo


if __name__ == "__main__":
    demo = create_interface()
    # 0.0.0.0 binds all network interfaces (container/remote friendly);
    # share=False keeps the app off Gradio's public tunnel.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )