Ayesha-Majeed committed on
Commit
886d641
·
verified Β·
1 Parent(s): e2f5d23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +314 -142
app.py CHANGED
@@ -1,185 +1,357 @@
1
  import gradio as gr
2
  import json
 
3
  from pathlib import Path
4
  from typing import List, Dict, Any
 
5
  from PIL import Image
6
  import PyPDF2
7
- import pytesseract
8
- import google.generativeai as genai
9
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
10
 
 
 
11
 
12
# ==================== Configure Gemini API ====================
import os  # local import: the key now comes from the environment (see below)

# SECURITY FIX: the API key was hard-coded in source (and therefore published
# with the repository). Read it from the environment instead; set the
# GEMINI_API_KEY variable in the deployment secrets.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)

# Prompt sent to Gemini: target JSON schema for shipping documents plus
# extraction rules. The model is instructed to answer with bare JSON.
EXTRACTION_PROMPT = """You are a shipping document data extraction specialist. Extract structured data from the provided shipping/logistics documents.
Extract the following fields into a JSON format:
{
"poNumber": "Purchase Order Number",
"shipFrom": "Origin/Ship From Location",
"carrierType": "Transportation type (RAIL/TRUCK/etc)",
"originCarrier": "Carrier name (CN/CPRS/etc)",
"railCarNumber": "Rail car identifier",
"totalQuantity": "Total quantity as number",
"totalUnits": "Unit type (UNIT/MBF/MSFT/etc)",
"accountName": "Customer/Account name",
"inventories": {
"items": [
{
"quantityShipped": "Quantity as number",
"inventoryUnits": "Unit type",
"productName": "Full product description",
"productCode": "Product code/SKU",
"product": {
"category": "Product category (OSB/Lumber/etc)",
"unit": "Unit count as number",
"pcs": "Pieces per unit",
"mbf": "Thousand board feet (if applicable)",
"sf": "Square feet (if applicable)",
"pcsHeight": "Height in inches",
"pcsWidth": "Width in inches",
"pcsLength": "Length in feet"
},
"customFields": [
"Mill||Mill Name",
"Vendor||Vendor Name"
]
}
]
}
}
IMPORTANT INSTRUCTIONS:
1. Extract ALL products/items found in the document
2. Convert text numbers to actual numbers (e.g., "54" → 54)
3. Parse dimensions carefully, Do NOT convert units
4. Calculate MBF/SF when possible from dimensions and piece count
5. If a field is not found, use null
6. For multiple products, create separate items
7. Extract custom fields like Mill, Vendor
Return ONLY valid JSON, no markdown formatting or explanations."""
62
-
63
# ==================== Utility functions ====================
def extract_text_from_pdf(pdf_file) -> str:
    """Return the concatenated text of every page in *pdf_file*.

    Failures are reported in-band as an "Error extracting PDF text: ..."
    string, since callers treat the result as document text either way.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        pages = [page.extract_text() + "\n" for page in reader.pages]
        return "".join(pages)
    except Exception as exc:
        return f"Error extracting PDF text: {str(exc)}"
73
 
74
def convert_pdf_to_images(pdf_file) -> List[Image.Image]:
    """Rasterize *pdf_file* into one PIL image per page.

    Requires the optional ``pdf2image`` package; on any failure the error is
    printed and an empty list is returned (best-effort, never raises).
    """
    try:
        from pdf2image import convert_from_path
        return convert_from_path(pdf_file)
    except Exception as exc:
        print(f"Error converting PDF to images: {exc}")
        return []
 
 
 
 
 
82
 
83
def extract_text_from_image(img: Image.Image) -> str:
    """OCR *img* with pytesseract; return "" if OCR fails.

    Errors are printed rather than raised so one bad image does not abort
    a multi-file batch.
    """
    try:
        return pytesseract.image_to_string(img)
    except Exception as exc:
        print(f"Error extracting text from image: {exc}")
        return ""
90
 
91
def process_files(files: List[str]) -> Dict[str, Any]:
    """Read every uploaded file and aggregate its text and images.

    Returns a dict with:
      files         -- per-file metadata ({filename, type, content})
      combined_text -- all extracted text, one "--- name ---" section per file
      images        -- PIL images (PDF pages and uploaded pictures)
    """
    result: Dict[str, Any] = {"files": [], "combined_text": "", "images": []}

    for file_path in files:
        path = Path(file_path)
        file_name = path.name
        file_ext = path.suffix.lower()
        entry = {"filename": file_name, "type": file_ext, "content": ""}

        try:
            if file_ext == '.pdf':
                text = extract_text_from_pdf(file_path)
                entry["content"] = text
                result["combined_text"] += f"\n--- {file_name} ---\n{text}\n"
                # Page images let the model see layout that text extraction misses.
                result["images"].extend(convert_pdf_to_images(file_path))
            elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.gif']:
                img = Image.open(file_path)
                result["images"].append(img)
                text = extract_text_from_image(img)
                result["combined_text"] += f"\n--- {file_name} ---\n{text}\n"
                entry["content"] = f"Image file: {file_name}"
            elif file_ext in ['.txt']:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                result["combined_text"] += f"\n--- {file_name} ---\n{text}\n"
                entry["content"] = text
            result["files"].append(entry)
        except Exception as exc:
            # Record the failure in place of the content; the file still
            # appears in the listing.
            entry["content"] = f"Error processing file: {str(exc)}"
            result["files"].append(entry)

    return result
130
 
131
def extract_with_gemini(processed_data: Dict[str, Any]) -> Dict[str, Any]:
    """Ask Gemini to turn the aggregated document data into structured JSON.

    Sends the extraction prompt, the combined text, and up to five images to
    the model; returns the parsed JSON dict, or {"error": ...} on any failure.
    """
    try:
        model = genai.GenerativeModel('models/gemini-2.5-flash')
        parts = [EXTRACTION_PROMPT]
        combined = processed_data["combined_text"]
        if combined:
            parts.append(f"\nDocument Text:\n{combined}")
        # Cap at five images to keep the request size reasonable.
        parts.extend(processed_data["images"][:5])

        reply = model.generate_content(parts)
        raw = reply.text.strip()
        # Strip Markdown code fences the model sometimes adds.
        raw = raw.replace("```json", "").replace("```", "")
        return json.loads(raw)
    except Exception as exc:
        return {"error": str(exc)}
148
-
149
# ==================== Gradio function ====================
def gradio_extraction(uploaded_files):
    """Gradio handler: copy uploads to temp, extract, return (json_str, path).

    Writes the extracted JSON to ``output.json`` and returns both the
    pretty-printed JSON string and that file path for download.
    """
    file_paths = []
    for upload in uploaded_files:
        src_path = Path(upload.name)
        dst_path = Path(tempfile.gettempdir()) / src_path.name
        # Copy the upload into a stable temp location before processing.
        with open(src_path, "rb") as src, open(dst_path, "wb") as dst:
            dst.write(src.read())
        file_paths.append(str(dst_path))

    extracted_data = extract_with_gemini(process_files(file_paths))

    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=2)

    return json.dumps(extracted_data, indent=2), "output.json"
170
-
171
-
172
# ==================== Gradio Interface ====================
# FIX: launching at import time blocks any importer of this module; guard the
# launch so the server only starts when the file is run as a script (Gradio /
# HF Spaces execute app.py as __main__, so deployed behavior is unchanged).
iface = gr.Interface(
    fn=gradio_extraction,
    inputs=gr.File(file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".txt"], file_count="multiple"),
    outputs=[
        gr.Textbox(label="Extracted JSON", lines=15, max_lines=30),
        gr.File(label="Download JSON"),
    ],
    title="Shipping Document Text Extractor",
    description="Upload PDFs or images of shipping/logistics documents and get structured JSON output.",
    theme=gr.themes.Base(primary_hue="blue"),
)

if __name__ == "__main__":
    iface.launch()
 
1
  import gradio as gr
2
  import json
3
+ import os
4
  from pathlib import Path
5
  from typing import List, Dict, Any
6
+ import google.generativeai as genai
7
  from PIL import Image
8
  import PyPDF2
 
 
9
  import tempfile
10
+ import traceback
11
+
12
# ==============================================================
# API Configuration
# ==============================================================
# SECURITY FIX: the Gemini key was hard-coded here (and therefore published
# with the repository). Read it from the environment instead; set the
# GEMINI_API_KEY variable in the Space/host secrets.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
16
# ==============================================================
# Enhanced extraction prompt with better instructions
# ==============================================================
# BUG FIX: the schema example contained the malformed line
#   "pcs": "pcs": "Pieces per unit",
# (duplicated key token — not valid JSON), which undermines the instruction
# that the model must match this exact structure.
EXTRACTION_PROMPT = """You are an expert shipping-document data extractor with OCR capabilities.
Carefully analyze ALL text content from PDFs, images, and documents.

CRITICAL: Look at both the text AND the visual layout of documents. Sometimes important data
is in tables, handwritten notes, stamps, or poorly scanned areas.

Extract and structure the data as valid JSON only (no markdown, no commentary):

{
"poNumber": string | null,
"shipFrom": string | null,
"carrierType": string | null,
"originCarrier": string | null,
"railCarNumber": string | null,
"totalQuantity": number | null,
"totalUnits": string | null,
"attachments": [string],
"accountName": string | null,
"inventories": {
"items": [
{
"quantityShipped": "Quantity as number, no of packages",
"inventoryUnits": string | null,
"pcs": number | null,
"productName": string | null,
"productCode": string | null,
"product": {
"category": "Product category (OSB/Lumber/etc)",
"defaultUnits": string | null,
"unit": "Unit type from document (MBF, FBM, SF, UNIT etc.)",
"pcs": "Pieces per unit",
"mbf": number | null,
"sf": number | null,
"pcsHeight": number | null,
"pcsWidth": number | null,
"pcsLength": number | null
},
"customFields": [string]
}
]
}
}

EXTRACTION RULES:
1. Extract ALL product line items - create one inventory item per product
2. Parse dimensions: "2X6X14" → pcsHeight=2, pcsWidth=6, pcsLength=14
3. BF = totalQuantity
4. Convert BF to MBF: BF ÷ 1000
5. customFields format: "Key||Value" (e.g., "Mill||Tolko")
6. Look for: PO numbers, shipping info, quantities, product codes, dimensions
7. Check headers, footers, stamps, handwritten notes, and table cells
8. If multiple documents, consolidate all items into one JSON
9. Return null for missing fields
10. attachments should list all provided filenames

Return ONLY valid JSON matching this exact structure."""
75
 
76
+
77
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file, labelling each page.

    Returns the concatenated page text, a placeholder string when the PDF
    yields no text at all, or an in-band "Error extracting PDF text: ..."
    message on failure.
    """
    try:
        with open(pdf_path, 'rb') as fh:
            reader = PyPDF2.PdfReader(fh)
            chunks = []
            for page_num, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text:
                    chunks.append(f"\n--- Page {page_num + 1} ---\n{page_text}")
            text = "".join(chunks)
        return text if text.strip() else "No text extracted from PDF"
    except Exception as exc:
        return f"Error extracting PDF text: {str(exc)}"
 
90
 
91
+
92
def process_files_for_gemini(files: List[str]) -> Dict[str, Any]:
    """Collect local text and upload binary documents to Gemini.

    Returns a dict with:
      text_content -- locally extracted text, one "=== name ===" section per file
      file_objects -- handles returned by genai.upload_file (PDFs/images)
      attachments  -- names of all processed files
      file_info    -- human-readable "File: name (Type: ext)" strings
    """
    bundle: Dict[str, Any] = {
        "text_content": "",
        "file_objects": [],
        "attachments": [],
        "file_info": [],
    }
    if not files:
        return bundle

    for file_path in files:
        # Silently skip paths that vanished between upload and processing.
        if not os.path.exists(file_path):
            continue

        path = Path(file_path)
        file_name = path.name
        file_ext = path.suffix.lower()
        bundle["attachments"].append(file_name)
        bundle["file_info"].append(f"File: {file_name} (Type: {file_ext})")

        try:
            if file_ext == '.pdf':
                # Local text extraction plus a Gemini upload for visual analysis.
                text = extract_text_from_pdf(file_path)
                bundle["text_content"] += f"\n\n=== {file_name} ===\n{text}"
                bundle["file_objects"].append(genai.upload_file(file_path))
            elif file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
                bundle["file_objects"].append(genai.upload_file(file_path))
                bundle["text_content"] += f"\n\n=== {file_name} (Image) ===\n[Image uploaded for visual analysis]"
            elif file_ext in ['.txt', '.csv']:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
                    text = fh.read()
                bundle["text_content"] += f"\n\n=== {file_name} ===\n{text}"
            elif file_ext in ['.doc', '.docx']:
                # Best-effort Word support; python-docx is optional.
                try:
                    import docx
                    document = docx.Document(file_path)
                    text = "\n".join([para.text for para in document.paragraphs])
                    bundle["text_content"] += f"\n\n=== {file_name} ===\n{text}"
                except ImportError:
                    bundle["text_content"] += f"\n\n=== {file_name} ===\n[Word document - install python-docx for text extraction]"
                except Exception as exc:
                    bundle["text_content"] += f"\n\n=== {file_name} ===\nError reading Word doc: {str(exc)}"
        except Exception as exc:
            # Record the failure inline so downstream prompting still sees
            # which file went wrong.
            bundle["text_content"] += f"\n\n=== {file_name} ===\nError processing: {str(exc)}"

    return bundle
153
 
154
+
155
def extract_with_gemini(processed_data: Dict[str, Any], api_key: str, model_name: str = "gemini-2.5-flash") -> Dict[str, Any]:
    """Run the multimodal extraction prompt through Gemini.

    Returns {"success": True, "data": ..., "raw_response": ...,
    "files_processed": n} on success, or {"success": False, "error": ...}
    (optionally with "raw_response"/"suggestion"/"traceback") on failure.
    """
    if not api_key or api_key.strip() == "":
        return {
            "success": False,
            "error": "Gemini API key not provided"
        }

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name)

        # Prompt + extracted text + attachment list, followed by the files
        # uploaded to Gemini for visual analysis.
        request = [
            EXTRACTION_PROMPT,
            f"\n\nDOCUMENT CONTEXT:\n{processed_data['text_content']}\n",
            f"\nATTACHMENTS: {json.dumps(processed_data['attachments'])}\n",
            "\nNow analyze the uploaded files carefully (including visual content) and extract the data as JSON:"
        ]
        request.extend(processed_data["file_objects"])

        # Low temperature for deterministic extraction.
        config = genai.types.GenerationConfig(
            temperature=0.2,
            max_output_tokens=8000,
        )
        response = model.generate_content(request, generation_config=config)

        raw = response.text.strip()
        # Strip a surrounding Markdown code fence, if the model added one.
        if raw.startswith("```json"):
            raw = raw[7:]
        elif raw.startswith("```"):
            raw = raw[3:]
        if raw.endswith("```"):
            raw = raw[:-3]
        raw = raw.strip()

        return {
            "success": True,
            "data": json.loads(raw),
            "raw_response": raw,
            "files_processed": len(processed_data["file_objects"])
        }

    except json.JSONDecodeError as exc:
        return {
            "success": False,
            "error": f"JSON parsing error: {str(exc)}",
            "raw_response": response.text if 'response' in locals() else "No response",
            "suggestion": "The AI returned non-JSON text. Try again or check the raw response."
        }
    except Exception as exc:
        return {
            "success": False,
            "error": f"Extraction error: {str(exc)}",
            "traceback": traceback.format_exc()
        }
228
 
 
 
 
 
229
 
230
def process_documents(files):
    """Main Gradio handler.

    Returns a 3-tuple of strings: (status log, JSON output, preview text).
    Errors are reported through the same tuple rather than raised.
    """
    if not files or len(files) == 0:
        return "❌ Error: Please upload at least one file", "{}", "No files provided"

    api_key = GEMINI_API_KEY
    # NOTE(review): extract_with_gemini defaults to "gemini-2.5-flash" but
    # this handler pins "gemini-2.0-flash" — confirm which model is intended.
    model_choice = "gemini-2.0-flash"

    if not api_key or api_key.strip() == "":
        return "❌ Error: API key not configured in code", "{}", "API key missing"

    try:
        # Gradio may hand back file objects or plain path strings.
        file_paths = [f.name if hasattr(f, 'name') else f for f in files]

        status_msg = f"📄 Processing {len(file_paths)} file(s)...\n"

        processed_data = process_files_for_gemini(file_paths)
        status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"

        status_msg += "🤖 Extracting data with Gemini AI...\n"
        result = extract_with_gemini(processed_data, api_key, model_choice)

        if result.get("success"):
            json_output = json.dumps(result["data"], indent=2)
            status_msg += f"✅ Extraction successful! Processed {result.get('files_processed', 0)} files.\n"
            display_text = "=== EXTRACTED DATA ===\n\n" + json_output
            return status_msg, json_output, display_text

        error_msg = f"❌ Extraction failed:\n{result.get('error', 'Unknown error')}\n"
        if 'suggestion' in result:
            error_msg += f"\n💡 {result['suggestion']}\n"
        if 'traceback' in result:
            error_msg += f"\nDebug info:\n{result['traceback'][:500]}"

        raw_resp = result.get('raw_response', 'No response')
        return error_msg, "{}", f"Raw Response:\n{raw_resp[:1000]}"

    except Exception as exc:
        error_msg = f"❌ Unexpected error: {str(exc)}\n{traceback.format_exc()[:500]}"
        return error_msg, "{}", error_msg
279
+
280
+
281
# ==============================================================
# Gradio Interface
# ==============================================================

def create_interface():
    """Assemble the Gradio Blocks UI; returns the demo without launching it."""
    with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
        gr.Markdown("""
# 📄 Shipping Document Data Extractor

Upload PDFs, images, Word docs, or text files to extract structured shipping data using Google Gemini AI.

**Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
""")

        with gr.Row():
            with gr.Column(scale=2):
                # Multi-file uploader; extensions match the branches handled
                # in process_files_for_gemini.
                file_input = gr.File(
                    label="📎 Upload Documents",
                    file_count="multiple",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".csv", ".doc", ".docx"]
                )

                submit_btn = gr.Button("🚀 Extract Data", variant="primary", size="lg")

            with gr.Column(scale=3):
                # Progress / error log written by process_documents.
                status_output = gr.Textbox(
                    label="📊 Status",
                    lines=4,
                    max_lines=8
                )

                json_output = gr.Code(
                    label="📋 JSON Output (Copy this)",
                    language="json",
                    lines=15
                )

                display_output = gr.Textbox(
                    label="👁️ Preview",
                    lines=10,
                    max_lines=15
                )

        gr.Markdown("""
### 💡 Tips:
- Upload multiple files for batch processing
- For images: ensure text is clear and well-lit
- For PDFs: both text-based and scanned PDFs work
- The AI will analyze visual content even if text extraction fails
""")

        # Button action: outputs map 1:1 to process_documents' return tuple.
        submit_btn.click(
            fn=process_documents,
            inputs=[file_input],
            outputs=[status_output, json_output, display_output]
        )

        # Examples
        # NOTE(review): "example1.pdf" must exist next to app.py or Gradio
        # will warn/fail at startup — confirm the file ships with the app.
        gr.Examples(
            examples=[
                [["example1.pdf"]],
            ],
            inputs=[file_input],
            label="Example Usage"
        )

    return demo
349
+
350
+
351
if __name__ == "__main__":
    # Bind to all interfaces on the standard Hugging Face Spaces port;
    # no public share link.
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)