# Hugging Face Space: abinash73/api-olmocr-api (status: Running)
# File size: 12,011 bytes

import gradio as gr
import json
import re
from datetime import datetime
from paddleocr import PaddleOCR
from PIL import Image
import pdf2image
import numpy as np

# Shared OCR engine: constructed once when the module is imported and reused
# by every call to extract_text_from_image().
ocr = PaddleOCR(
    lang='en',
    use_textline_orientation=True,
)

def _text_block_from_entry(bbox, text, confidence):
    """Build one positioned text block, or return None if the box is unusable.

    ``bbox`` must hold at least four ``[x, y]`` points; the midpoint of the
    first and third points is used as the block position.
    """
    try:
        if len(bbox) < 4:
            return None
        # Convert to float before adding so fixed-width integer coordinates
        # (for example NumPy scalars) cannot overflow in the sum.
        x_center = (float(bbox[0][0]) + float(bbox[2][0])) / 2
        y_center = (float(bbox[0][1]) + float(bbox[2][1])) / 2
    except (IndexError, TypeError, KeyError, ValueError):
        # Malformed geometry: skip this entry instead of failing the page.
        return None

    return {
        'text': text,
        'y': y_center,
        'x': x_center,
        'confidence': confidence
    }


def _iter_ocr_entries(page):
    """Yield ``(bbox, text, confidence)`` tuples from one page of OCR output.

    Two layouts are understood:

    * a mapping with parallel ``rec_texts`` / ``rec_scores`` / ``rec_polys``
      (or ``dt_polys``) sequences -- the result layout documented for newer
      PaddleOCR releases;
    * the legacy list of ``[bbox, (text, confidence)]`` entries.
    """
    if hasattr(page, 'get'):
        texts = page.get('rec_texts')
        polys = page.get('rec_polys')
        if polys is None:
            polys = page.get('dt_polys')
        if texts is None or polys is None:
            return
        scores = page.get('rec_scores')
        for index, (bbox, text) in enumerate(zip(polys, texts)):
            has_score = scores is not None and index < len(scores)
            yield bbox, text, (scores[index] if has_score else 0.0)
        return

    for line in page:
        try:
            if not line or len(line) < 2:
                continue
            bbox, text_info = line[0], line[1]
            if isinstance(text_info, (tuple, list)):
                text = text_info[0]
                confidence = text_info[1] if len(text_info) > 1 else 0.0
            else:
                text = str(text_info)
                confidence = 0.0
        except (IndexError, TypeError, KeyError, ValueError):
            # Skip entries that do not have the expected shape.
            continue
        yield bbox, text, confidence


def extract_text_from_image(image):
    """Run OCR on an image and return positioned text blocks.

    Args:
        image: A PIL image (converted to a NumPy array first) or any other
            input accepted by ``ocr.ocr``.

    Returns:
        A list of dicts with keys ``text``, ``x``, ``y`` (centre of the
        detected box) and ``confidence``. The list is empty when the OCR
        engine reports nothing for the first page of its result.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)

    result = ocr.ocr(image)

    # Only the first page of the result is used.
    if not result or not result[0]:
        return []

    candidates = (
        _text_block_from_entry(bbox, text, confidence)
        for bbox, text, confidence in _iter_ocr_entries(result[0])
    )
    return [block for block in candidates if block is not None]

def pdf_to_images(pdf_file):
    """Return the result of ``pdf2image.convert_from_path`` for ``pdf_file``."""
    return pdf2image.convert_from_path(pdf_file)

def extract_gstin(text):
    """Return the first GSTIN-shaped substring of ``text``, or None.

    The shape searched for is 15 characters: two digits, five uppercase
    letters, four digits, one uppercase letter, one uppercase letter or
    digit, a literal ``Z`` and a final uppercase letter or digit.
    """
    found = re.search(
        r'\d{2}[A-Z]{5}\d{4}[A-Z]{1}[A-Z\d]{1}[Z]{1}[A-Z\d]{1}',
        text,
    )
    if found is None:
        return None
    return found.group(0)

def extract_pincode(text):
    """Return the first standalone run of exactly six digits in ``text``.

    Returns None when ``text`` contains no such run.
    """
    found = re.search(r'\b\d{6}\b', text)
    if found is None:
        return None
    return found.group(0)

def extract_mobile(text):
    """Return the first standalone ten-digit number starting with 6-9.

    Returns None when ``text`` contains no such number.
    """
    found = re.search(r'\b[6-9]\d{9}\b', text)
    if found is None:
        return None
    return found.group(0)

def extract_date(text):
    """Return the first date-like substring of ``text``, or None.

    Patterns are tried in priority order: ``DD-MM-YYYY``, ``DD-MM-YY`` and
    ``YYYY-MM-DD``, where each separator may be ``-`` or ``/``. The first
    pattern that matches anywhere in the text wins.

    Every pattern is guarded so that it cannot start or end in the middle of
    a longer digit run. Without the guards the two-digit-year pattern matches
    the tail of an ISO date ("2024-01-15" would yield "24-01-15").
    """
    if not text:
        return None

    date_patterns = (
        r'(?<!\d)\d{2}[-/]\d{2}[-/]\d{4}(?!\d)',
        r'(?<!\d)\d{2}[-/]\d{2}[-/]\d{2}(?!\d)',
        r'(?<!\d)\d{4}[-/]\d{2}[-/]\d{2}(?!\d)',
    )
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

def extract_invoice_number(text_blocks):
    """Return the invoice/bill number found next to its label, or None.

    A block is treated as the label when it contains "invoice" or "bill"
    followed by "number", "no" or "#" (case-insensitive). The number is the
    first token after the label that is made of letters, digits, ``/`` or
    ``-`` and contains at least one digit. If the label block carries no such
    token, the start of the following block is checked instead.

    Args:
        text_blocks: Sequence of dicts that each have a ``'text'`` string.
    """
    label_re = re.compile(r'(?:invoice|bill)\s*(?:number|no|#)', re.IGNORECASE)
    # Requiring a digit keeps the label's own letters (for example the "I"
    # of "Invoice") and plain words from being returned as the number.
    token_re = re.compile(r'(?=[A-Za-z/-]*\d)[A-Za-z0-9][A-Za-z0-9/-]*')

    blocks = list(text_blocks)
    for index, block in enumerate(blocks):
        text = block['text']
        label = label_re.search(text)
        if not label:
            continue

        # Search only the text that follows the label.
        same_block = token_re.search(text, label.end())
        if same_block:
            return same_block.group(0)

        # The label may stand alone with the value in the next block.
        if index + 1 < len(blocks):
            following = blocks[index + 1]['text'].lstrip(' \t:#.-')
            next_block = token_re.match(following)
            if next_block:
                return next_block.group(0)
    return None

def extract_amounts(text):
    """Return every number found in ``text`` as a float, in order.

    A number is a run of digits, optionally grouped with commas and
    optionally followed by a decimal part; an optional leading rupee sign and
    whitespace are skipped. The decimal part may have any number of digits,
    so "99.5" is read as 99.5 and not split into 99 and 5.
    """
    amount_pattern = r'₹?\s*(\d+(?:,\d+)*(?:\.\d+)?)'
    return [
        float(amount.replace(',', ''))
        for amount in re.findall(amount_pattern, text)
    ]

def find_header_info(text_blocks):
    """Collect supplier and invoice header fields from OCR text blocks.

    PIN code, GSTIN, contact number and invoice date are searched for in the
    space-joined text of all blocks; the invoice number is searched block by
    block. The supplier name is the first of the five topmost blocks
    (smallest ``y``) whose stripped text is longer than three characters and
    is not made up solely of digits, whitespace, dots and commas.

    Returns:
        A dict with keys ``supplier_name``, ``supplier_pincode``, ``gstin``,
        ``contact_no``, ``invoice_no`` and ``invoice_date``; a field that is
        not found is None.
    """
    joined_text = ' '.join(entry['text'] for entry in text_blocks)

    pincode = extract_pincode(joined_text)
    gstin = extract_gstin(joined_text)
    contact_no = extract_mobile(joined_text)
    invoice_no = extract_invoice_number(text_blocks)
    invoice_date = extract_date(joined_text)

    topmost = sorted(text_blocks, key=lambda entry: entry['y'])[:5]
    stripped_texts = (entry['text'].strip() for entry in topmost)
    supplier_name = next(
        (
            candidate
            for candidate in stripped_texts
            if len(candidate) > 3 and not re.match(r'^[\d\s.,]+$', candidate)
        ),
        None,
    )

    return {
        "supplier_name": supplier_name,
        "supplier_pincode": pincode,
        "gstin": gstin,
        "contact_no": contact_no,
        "invoice_no": invoice_no,
        "invoice_date": invoice_date
    }

def find_line_items(text_blocks):
    """Heuristically assemble invoice line items from OCR text blocks.

    Blocks are visited from top to bottom (ascending ``y``). Each block may
    contribute fields to a pending item, and a field that already holds a
    truthy value is not overwritten:

    * ``hsn``: first standalone run of 4-8 digits;
    * ``qty`` / ``unit``: first standalone number, with an optional unit
      (pcs, nos, kg, ltr, box, unit) directly after it, else ``'Nos'``;
    * ``rate``: first number in the block (commas removed);
    * ``gst_percent``: first number followed by ``%``.

    As soon as the pending item holds at least three fields it is emitted,
    named after the first 50 characters of the block that completed it, and
    a new pending item is started. Fields gathered from earlier blocks carry
    over until that happens.

    Returns:
        A list of dicts with keys ``item_name``, ``hsn``, ``qty``, ``unit``,
        ``rate``, ``discount`` and ``gst_percent``.
    """
    qty_regex = r'\b(\d+(?:\.\d+)?)\s*(pcs|nos|kg|ltr|box|unit)?'
    amount_regex = r'₹?\s*(\d+(?:,\d+)*(?:\.\d{2})?)'
    # Values used for any field the pending item never received.
    fallback_fields = {
        'item_name': 'Item',
        'hsn': '',
        'qty': 0,
        'unit': 'Nos',
        'rate': 0,
        'discount': 0,
        'gst_percent': 0
    }

    line_items = []
    pending = {}

    for entry in sorted(text_blocks, key=lambda candidate: candidate['y']):
        text = entry['text'].strip()

        if not pending.get('hsn'):
            hsn_hit = re.search(r'\b\d{4,8}\b', text)
            if hsn_hit:
                pending['hsn'] = hsn_hit.group(0)

        if not pending.get('qty'):
            qty_hit = re.search(qty_regex, text.lower())
            if qty_hit:
                quantity, unit = qty_hit.groups()
                pending['qty'] = float(quantity)
                pending['unit'] = unit or 'Nos'

        if not pending.get('rate'):
            amount_hits = re.findall(amount_regex, text)
            if amount_hits:
                pending['rate'] = float(amount_hits[0].replace(',', ''))

        if not pending.get('gst_percent'):
            gst_hit = re.search(r'(\d+(?:\.\d+)?)\s*%', text)
            if gst_hit:
                pending['gst_percent'] = float(gst_hit.group(1))

        # Not enough fields yet: keep accumulating into the same item.
        if len(pending) < 3:
            continue

        pending.setdefault('item_name', text[:50])
        line_items.append({**fallback_fields, **pending})
        pending = {}

    return line_items

def calculate_totals(items):
    """Add computed amounts to each line item and return invoice totals.

    For every item, ``gross = qty * rate``, ``taxable = gross - discount``
    and ``gst = taxable * gst_percent / 100``; missing inputs count as 0.
    Each item dict is updated in place with ``gross_amount``,
    ``taxable_amount``, ``gst_amount`` and ``total_amount`` rounded to two
    decimals.

    Returns:
        A dict with ``total_gross``, ``total_taxable``, ``total_gst`` and
        ``grand_total``, each rounded to two decimals. Totals are summed from
        the unrounded per-item values.
    """
    running_gross = 0
    running_taxable = 0
    running_gst = 0

    for line in items:
        line_gross = line.get('qty', 0) * line.get('rate', 0)
        line_taxable = line_gross - line.get('discount', 0)
        line_gst = (line_taxable * line.get('gst_percent', 0)) / 100

        line.update(
            gross_amount=round(line_gross, 2),
            taxable_amount=round(line_taxable, 2),
            gst_amount=round(line_gst, 2),
            total_amount=round(line_taxable + line_gst, 2),
        )

        running_gross += line_gross
        running_taxable += line_taxable
        running_gst += line_gst

    totals = {
        'total_gross': running_gross,
        'total_taxable': running_taxable,
        'total_gst': running_gst,
        'grand_total': running_taxable + running_gst,
    }
    return {label: round(amount, 2) for label, amount in totals.items()}

def extract_invoice_data(file):
    """Extract structured invoice data from an uploaded PDF or image.

    Args:
        file: Either a filesystem path string or an object whose ``name``
            attribute is the path of the uploaded file. ``None`` (nothing
            uploaded) yields an error payload instead of raising.

    Returns:
        A JSON string. On success it contains ``header``, ``details``,
        ``footer`` and ``debug_info``. On failure it contains ``error`` and
        ``message`` (plus ``error_type`` and ``traceback`` for unexpected
        exceptions). This function never raises.
    """
    if file is None:
        return json.dumps({
            "error": "No file provided",
            "message": "Please upload a PDF or image invoice before extracting."
        }, indent=2)

    try:
        # Accept both plain path strings and upload objects exposing `.name`.
        path = file if isinstance(file, str) else file.name

        if path.lower().endswith('.pdf'):
            pages = pdf_to_images(path)
            if not pages:
                return json.dumps({
                    "error": "Empty PDF",
                    "message": "The PDF did not produce any page images."
                }, indent=2)
            # Only the first page is processed.
            text_blocks = extract_text_from_image(pages[0])
        else:
            # The context manager closes the underlying file handle once the
            # pixel data has been read by the OCR step.
            with Image.open(path) as image:
                text_blocks = extract_text_from_image(image)

        # Check if OCR extracted any text
        if not text_blocks:
            return json.dumps({
                "error": "No text detected",
                "message": "Could not extract any text from the image. Please ensure the image is clear and contains text."
            }, indent=2)

        # Extract different sections
        header = find_header_info(text_blocks)
        details = find_line_items(text_blocks)
        # calculate_totals also adds the computed amounts to each detail row.
        footer = calculate_totals(details)

        result = {
            "header": header,
            "details": details,
            "footer": footer,
            "debug_info": {
                "total_text_blocks": len(text_blocks),
                "sample_text": [block['text'] for block in text_blocks[:5]]
            }
        }

        return json.dumps(result, indent=2, ensure_ascii=False)

    except Exception as e:
        # Top-level boundary for the UI/API: report the failure as JSON
        # rather than letting the exception escape.
        import traceback
        return json.dumps({
            "error": str(e),
            "error_type": type(e).__name__,
            "traceback": traceback.format_exc(),
            "message": "Failed to process invoice"
        }, indent=2)

# Create Gradio Interface: an upload column on the left, the JSON result on
# the right, followed by usage documentation rendered as Markdown.
with gr.Blocks(title="Purchase Invoice Data Extraction", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧾 Purchase Invoice Data Extraction API
    
    Upload purchase invoices (PDF or Image) to automatically extract structured data including:
    - Supplier details (Name, PIN, GSTIN, Contact)
    - Invoice information (Number, Date)
    - Line items (Name, HSN, Qty, Rate, Discounts, GST%)
    - Calculated totals (Gross, Taxable, Tax, Grand Total)
    """)
    
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Invoice (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
            )
            extract_btn = gr.Button("Extract Data", variant="primary", size="lg")
            
            # NOTE(review): the OCR engine in this file is created with
            # lang='en'; confirm Hindi support before advertising it here.
            gr.Markdown("""
            ### Supported Formats:
            - PDF documents
            - PNG, JPG, JPEG images
            - English and Hindi text
            """)
        
        with gr.Column():
            output_json = gr.Code(
                label="Extracted Data (JSON)",
                language="json",
                lines=25
            )
    
    # NOTE(review): the cURL route shown below depends on the installed
    # Gradio version -- verify it against the API page of the running app.
    gr.Markdown("""
    ### Output Structure:
    ```json
    {
      "header": {
        "supplier_name": "...",
        "supplier_pincode": "...",
        "gstin": "...",
        "contact_no": "...",
        "invoice_no": "...",
        "invoice_date": "..."
      },
      "details": [
        {
          "item_name": "...",
          "hsn": "...",
          "qty": 0,
          "unit": "...",
          "rate": 0,
          "discount": 0,
          "gst_percent": 0,
          "gross_amount": 0,
          "taxable_amount": 0,
          "gst_amount": 0,
          "total_amount": 0
        }
      ],
      "footer": {
        "total_gross": 0,
        "total_taxable": 0,
        "total_gst": 0,
        "grand_total": 0
      }
    }
    ```
    
    ---
    
    ### API Usage:
    
    **Python Client:**
    ```python
    from gradio_client import Client
    
    client = Client("http://localhost:7860")
    result = client.predict(
        file="path/to/invoice.pdf",
        api_name="/predict"
    )
    print(result)
    ```
    
    **cURL:**
    ```bash
    curl -X POST http://localhost:7860/api/predict \\
      -F "file=@invoice.pdf"
    ```
    """)
    
    # Register the handler under the endpoint name that the usage docs above
    # tell clients to call (api_name="/predict").
    extract_btn.click(
        fn=extract_invoice_data,
        inputs=[file_input],
        outputs=[output_json],
        api_name="predict"
    )
    
    # Placeholder examples section; no sample invoices are bundled.
    gr.Examples(
        examples=[],
        inputs=[file_input],
        outputs=[output_json],
        fn=extract_invoice_data,
        cache_examples=False
    )

if __name__ == "__main__":
    # Listen on every interface at port 7860, with no public share link and
    # the API documentation enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_api": True,
    }
    demo.launch(**launch_options)