# Source: Hugging Face Spaces — prasannahf/Invoice_data_extraction (status: Sleeping)
# File size: 7,691 bytes · revision fb5ab2d

import gradio as gr
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from pdf2image import convert_from_path
from PIL import Image
import numpy as np
import tempfile
import os
import io
import base64
from together import Together
import json

# Load the docTR OCR predictor once at import time so every call to
# process_document reuses the same pretrained model instead of rebuilding it.
model = ocr_predictor(pretrained=True)

def upload_and_encode(file_path):
    """Return the first page of a document as a base64-encoded PNG string.

    Args:
        file_path: Path to a PDF or an image file. Files whose name ends in
            ``.pdf`` (case-insensitive) are rasterized with pdf2image at
            300 dpi and only page 1 is used; anything else is opened with PIL.

    Returns:
        The PNG bytes of the page/image, base64-encoded and decoded to a
        UTF-8 ``str`` (suitable for embedding in a ``data:image/png`` URL).
    """
    # Modes the PNG writer accepts; anything else (e.g. CMYK from a scanned
    # JPEG) must be converted first or ``save`` raises.
    png_modes = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}

    buffered = io.BytesIO()
    if file_path.lower().endswith('.pdf'):
        rendered = convert_from_path(file_path, dpi=300, first_page=1, last_page=1)
        rendered[0].save(buffered, format="PNG")
    else:
        # The context manager closes the underlying file handle so the caller
        # can delete the file afterwards without leaking a descriptor.
        with Image.open(file_path) as source:
            writable = source if source.mode in png_modes else source.convert("RGB")
            writable.save(buffered, format="PNG")

    return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Instruction prompt sent to the LLM together with the OCR text and page image.
_EXTRACTION_PROMPT = """
You are the world's most accurate invoice data extraction expert, capable of processing ANY Indian business document format.

🎯 MISSION: Extract ALL information from this document (Invoice/Credit Note/Debit Note/Tax Invoice/e-Invoice/RCM Invoice).

🔍 CRITICAL RULES:
- Use the IMAGE as PRIMARY source - correct OCR errors you can see
- Extract EVERY line item, charge, tax, and cess found in tables
- Handle multiple document types: Tax Invoice, Credit Note, Debit Note, RCM Invoice, e-Invoice, Railway Invoice
- Return ONLY valid JSON - no explanations, no markdown
- Use null for missing fields - NEVER guess or hallucinate
- Preserve exact number formatting as strings
- Extract government-specific fields (IRN, Ack No, e-way bill, etc.)

📋 UNIVERSAL JSON SCHEMA:
{
  "document_type": "string (Tax Invoice/Credit Note/Debit Note/RCM Invoice/e-Invoice)",
  "document_info": {
    "invoice_number": "string or null",
    "document_number": "string or null",
    "invoice_date": "string or null",
    "document_date": "string or null",
    "po_number": "string or null",
    "internal_ref_no": "string or null",
    "place_of_supply": "string or null",
    "bill_period_from": "string or null",
    "bill_period_to": "string or null",
    "reverse_charge_applicable": "string or null"
  },
  "government_fields": {
    "irn": "string or null",
    "ack_no": "string or null",
    "ack_date": "string or null",
    "eway_bill_no": "string or null",
    "eway_bill_date": "string or null",
    "cin": "string or null"
  },
  "supplier": {
    "name": "string or null",
    "address": "string or null",
    "gstin": "string or null",
    "pan": "string or null",
    "state": "string or null",
    "state_code": "string or null",
    "contact": {
      "email": "string or null",
      "phone": "string or null",
      "fax": "string or null"
    }
  },
  "customer": {
    "name": "string or null",
    "address": "string or null",
    "gstin": "string or null",
    "pan": "string or null",
    "state": "string or null",
    "state_code": "string or null",
    "customer_code": "string or null"
  },
  "consignee": {
    "name": "string or null",
    "address": "string or null",
    "gstin": "string or null",
    "state": "string or null"
  },
  "line_items": [
    {
      "sl_no": "string or null",
      "description": "string",
      "hsn_sac_code": "string or null",
      "uom": "string or null",
      "quantity": "string or null",
      "rate": "string or null",
      "amount": "string or null",
      "taxable_value": "string or null",
      "cgst_rate": "string or null",
      "cgst_amount": "string or null",
      "sgst_rate": "string or null",
      "sgst_amount": "string or null",
      "igst_rate": "string or null",
      "igst_amount": "string or null",
      "cess_rate": "string or null",
      "cess_amount": "string or null"
    }
  ],
  "additional_charges": [
    {
      "description": "string",
      "amount": "string",
      "type": "string (freight/packing/handling/penalty/bonus/escalation/cess etc.)"
    }
  ],
  "financial_totals": {
    "subtotal": "string or null",
    "total_taxable_amount": "string or null",
    "total_cgst": "string or null",
    "total_sgst": "string or null",
    "total_igst": "string or null",
    "total_cess": "string or null",
    "infrastructure_cess": "string or null",
    "environmental_cess": "string or null",
    "forest_permit_fee": "string or null",
    "total_tax_amount": "string or null",
    "round_off": "string or null",
    "total_invoice_amount": "string or null",
    "amount_in_words": {
      "tax_amount": "string or null",
      "total_amount": "string or null"
    }
  },
  "transport_details": {
    "mode_of_dispatch": "string or null",
    "vehicle_no": "string or null",
    "lr_rr_no": "string or null",
    "lr_rr_date": "string or null",
    "transporter": "string or null",
    "destination": "string or null"
  },
  "work_commodity_details": {
    "work_description": "string or null",
    "commodity": "string or null",
    "fe_percentage": "string or null",
    "batch_lot_no": "string or null"
  },
  "payment_terms": {
    "terms": "string or null",
    "due_date": "string or null"
  },
  "remarks_notes": "string or null"
}

⚠️ EXTRACT EVERYTHING: Don't skip penalty amounts, cess charges, infrastructure fees, environmental charges, bonus payments, escalation amounts, or any government-mandated fields.

Return the JSON only:
"""


def _stage_upload(uploaded_file):
    """Copy the uploaded document into a private temp file and return its path."""
    # Depending on the Gradio version and the File component's `type`, the
    # callback receives either a filesystem path or a file-like object that
    # exposes `.name` and `.read()`; accept both.
    if isinstance(uploaded_file, (str, os.PathLike)):
        source_name = os.fspath(uploaded_file)
        with open(source_name, "rb") as source:
            payload = source.read()
    else:
        source_name = uploaded_file.name
        payload = uploaded_file.read()

    # Keep the original extension: the PDF-vs-image branch below keys on it.
    suffix = os.path.splitext(source_name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(payload)
        return tmp_file.name


def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    candidate = text.strip()
    if not (candidate.startswith("```") and candidate.endswith("```")):
        return text
    # Drop the opening fence line (``` or ```json) and the closing ```.
    _opening, newline, remainder = candidate.partition("\n")
    if not newline:
        return text
    return remainder[:-3]


def process_document(uploaded_file, together_api_key):
    """OCR an uploaded invoice and ask the LLM to return its data as JSON.

    Args:
        uploaded_file: The value Gradio passes for the file input: ``None``
            when nothing was uploaded, a filesystem path, or a file-like
            object with ``.name`` and ``.read()``.
        together_api_key: API key forwarded unchanged to the Together client.

    Returns:
        A string: the LLM's JSON pretty-printed with 4-space indentation, or
        a human-readable message when no file was given or the LLM reply is
        empty / not valid JSON.

    Raises:
        Exceptions from file access, PDF rendering, OCR, or the Together API
        call are not caught here and propagate to the caller.
    """
    if uploaded_file is None:
        return "Please upload a file."

    file_path = _stage_upload(uploaded_file)
    try:
        # OCR every page of the document.
        if file_path.lower().endswith('.pdf'):
            pages = [np.array(img) for img in convert_from_path(file_path, dpi=300)]
        else:
            # Close the image handle before the temp file is removed.
            with Image.open(file_path) as img:
                pages = [np.array(img.convert("RGB"))]

        extracted_texts = [model([page]).render() for page in pages]
        full_text_output = "\n".join(extracted_texts)  # Combine all pages' text

        # Only the first page is sent to the LLM as an image.
        base64_image = upload_and_encode(file_path)
    finally:
        # Always remove the temp copy, even when OCR or encoding fails.
        os.unlink(file_path)

    # Call Together AI LLM
    client = Together(api_key=together_api_key)
    response = client.chat.completions.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": _EXTRACTION_PROMPT},
                {"type": "text", "text": f"OCR REFERENCE: {full_text_output}"},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
            ]
        }],
        max_tokens=3000,
        temperature=0.05,
        top_p=0.9
    )

    raw_output = response.choices[0].message.content
    if raw_output is None:
        return "Error: Empty response from LLM."

    # Models often wrap JSON in ```json fences despite the prompt; tolerate that.
    try:
        json_response = json.loads(_strip_code_fence(raw_output))
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return "Error: Invalid JSON from LLM. Raw output: " + raw_output
    return json.dumps(json_response, indent=4)

# Web UI: a file upload and an API-key box as inputs, one text box for the JSON.
iface = gr.Interface(
    title="Invoice OCR & Extraction App",
    description="Upload a document to extract text via OCR and structured data via open-source LLM.",
    fn=process_document,
    inputs=[
        gr.File(label="Upload PDF or Image"),
        # The key is typed in by the user for testing; use secrets in production.
        gr.Textbox(type="password", label="Together AI API Key"),
    ],
    outputs=gr.Textbox(label="Extracted Invoice JSON"),
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()