"""Gradio app that extracts structured purchase-invoice data via PaddleOCR.

The pipeline is: load the first page of the uploaded PDF/image, run OCR,
derive header fields with regular expressions, guess line items with a
simple heuristic, compute totals, and return everything as a JSON string.
"""
import json
import re
from datetime import datetime

import gradio as gr
import numpy as np
import pdf2image
from paddleocr import PaddleOCR
from PIL import Image

# Initialize PaddleOCR once at import time; the model is reused for every request.
ocr = PaddleOCR(use_textline_orientation=True, lang='en')

# Patterns are compiled once here instead of on every call.
_GSTIN_RE = re.compile(r'\d{2}[A-Z]{5}\d{4}[A-Z]{1}[A-Z\d]{1}[Z]{1}[A-Z\d]{1}')
_PINCODE_RE = re.compile(r'\b\d{6}\b')
_MOBILE_RE = re.compile(r'\b[6-9]\d{9}\b')
_AMOUNT_RE = re.compile(r'₹?\s*(\d+(?:,\d+)*(?:\.\d{2})?)')
_HSN_RE = re.compile(r'\b\d{4,8}\b')
_QTY_RE = re.compile(r'\b(\d+(?:\.\d+)?)\s*(pcs|nos|kg|ltr|box|unit)?')
_GST_PERCENT_RE = re.compile(r'(\d+(?:\.\d+)?)\s*%')

# The digit guards stop a pattern from matching inside a longer digit run,
# e.g. the two-digit-year form matching "24-01-15" inside "2024-01-15".
_DATE_PATTERNS = (
    re.compile(r'(?<!\d)\d{2}[-/]\d{2}[-/]\d{4}(?!\d)'),
    re.compile(r'(?<!\d)\d{4}[-/]\d{2}[-/]\d{2}(?!\d)'),
    re.compile(r'(?<!\d)\d{2}[-/]\d{2}[-/]\d{2}(?!\d)'),
)

# "invoice no", "bill number", "invoice #", ... The lookahead keeps words such
# as "bill note" from being read as a "bill no" label.
_INVOICE_LABEL_RE = re.compile(
    r'(?:invoice|bill)\s*(?:(?:number|no)(?![a-z])|#)', re.IGNORECASE
)
# An invoice-number token: letters/digits/'/'/'-' containing at least one digit.
_INVOICE_VALUE_RE = re.compile(
    r'(?=[A-Z0-9/-]*\d)[A-Z0-9][A-Z0-9/-]*', re.IGNORECASE
)


def _iter_ocr_lines(page):
    """Yield ``(bbox, text, confidence)`` for every line on one OCR page.

    Two result layouts are accepted:

    * a mapping with parallel ``rec_texts`` / ``rec_scores`` / ``rec_polys``
      sequences (the layout documented for PaddleOCR 3.x results), and
    * a list of ``[bbox, (text, confidence)]`` entries (PaddleOCR 2.x).

    Malformed entries are skipped rather than raised.
    """
    getter = getattr(page, 'get', None)
    if callable(getter):
        texts = getter('rec_texts')
        scores = getter('rec_scores')
        polys = getter('rec_polys')
        if polys is None:
            polys = getter('dt_polys')
        if texts is None or polys is None:
            return
        for index, (poly, text) in enumerate(zip(polys, texts)):
            has_score = scores is not None and index < len(scores)
            yield poly, text, (scores[index] if has_score else 0.0)
        return

    for line in page:
        try:
            if not line or len(line) < 2:
                continue
            bbox, text_info = line[0], line[1]
            if isinstance(text_info, (tuple, list)):
                text = text_info[0]
                confidence = text_info[1] if len(text_info) > 1 else 0.0
            else:
                text = text_info
                confidence = 0.0
        except (IndexError, TypeError, KeyError, ValueError):
            # Skip problematic entries; one bad line must not abort the page.
            continue
        yield bbox, text, confidence


def _block_center(bbox):
    """Return ``(x, y)`` midway between corners 0 and 2 of *bbox*, or ``None``.

    *bbox* is expected to be four ``[x, y]`` points; lists, tuples and numpy
    arrays are all accepted. ``None`` is returned for any other shape.
    """
    try:
        points = np.asarray(bbox, dtype=float)
    except (TypeError, ValueError):
        return None
    if points.ndim != 2 or points.shape[0] < 4 or points.shape[1] < 2:
        return None
    x_center = float(points[0][0] + points[2][0]) / 2
    y_center = float(points[0][1] + points[2][1]) / 2
    return x_center, y_center


def extract_text_from_image(image):
    """Extract text from image using PaddleOCR.

    Args:
        image: A PIL image or an array accepted by ``ocr.ocr``.

    Returns:
        A list of dicts with keys ``text``, ``x``, ``y`` (block centre) and
        ``confidence``; empty when OCR finds nothing.
    """
    if isinstance(image, Image.Image):
        # Normalise RGBA / palette / grayscale inputs to three channels.
        image = np.array(image.convert('RGB'))

    result = ocr.ocr(image)

    # Check if result is valid
    if not result or not result[0]:
        return []

    text_blocks = []
    for bbox, text, confidence in _iter_ocr_lines(result[0]):
        center = _block_center(bbox)
        if center is None:
            continue
        try:
            score = float(confidence)
        except (TypeError, ValueError):
            score = 0.0
        text_blocks.append({
            'text': str(text),
            'y': center[1],
            'x': center[0],
            'confidence': score,
        })
    return text_blocks


def pdf_to_images(pdf_file, first_page=None, last_page=None):
    """Convert PDF to images.

    Args:
        pdf_file: Path to the PDF.
        first_page: Optional 1-based first page to render (default: from start).
        last_page: Optional 1-based last page to render (default: to end).

    Returns:
        The list of page images produced by ``pdf2image.convert_from_path``.
    """
    return pdf2image.convert_from_path(
        pdf_file, first_page=first_page, last_page=last_page
    )


def extract_gstin(text):
    """Return the first GSTIN-shaped token in *text*, or ``None``."""
    match = _GSTIN_RE.search(text)
    return match.group(0) if match else None


def extract_pincode(text):
    """Return the first standalone 6-digit number in *text*, or ``None``."""
    match = _PINCODE_RE.search(text)
    return match.group(0) if match else None


def extract_mobile(text):
    """Return the first standalone 10-digit number starting with 6-9, or ``None``."""
    match = _MOBILE_RE.search(text)
    return match.group(0) if match else None


def extract_date(text):
    """Extract date in various formats.

    Tries ``DD-MM-YYYY``, then ``YYYY-MM-DD``, then ``DD-MM-YY`` (``-`` or ``/``
    as separator) and returns the first pattern that matches anywhere in
    *text*, as written. Returns ``None`` when nothing matches.
    """
    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(0)
    return None


def extract_invoice_number(text_blocks):
    """Extract invoice/bill number.

    Finds the first block containing an "invoice/bill no/number/#" label and
    returns the first digit-bearing token that follows the label, looking in
    the remainder of that block and then in the next block of *text_blocks*
    (list order). Returns ``None`` when no label or no token is found.
    """
    for index, block in enumerate(text_blocks):
        text = block['text']
        label = _INVOICE_LABEL_RE.search(text)
        if not label:
            continue
        # Search only after the label so the label's own letters are never
        # returned as the number.
        candidates = [text[label.end():]]
        if index + 1 < len(text_blocks):
            candidates.append(text_blocks[index + 1]['text'])
        for candidate in candidates:
            value = _INVOICE_VALUE_RE.search(candidate)
            if value:
                return value.group(0)
    return None


def extract_amounts(text):
    """Return every amount-like number in *text* as floats (commas removed)."""
    return [float(amt.replace(',', '')) for amt in _AMOUNT_RE.findall(text)]


def find_header_info(text_blocks):
    """Extract header information (supplier details).

    Returns:
        A dict with ``supplier_name``, ``supplier_pincode``, ``gstin``,
        ``contact_no``, ``invoice_no`` and ``invoice_date``; each value is
        ``None`` when it could not be found.
    """
    all_text = ' '.join([block['text'] for block in text_blocks])

    header = {
        "supplier_name": None,
        "supplier_pincode": extract_pincode(all_text),
        "gstin": extract_gstin(all_text),
        "contact_no": extract_mobile(all_text),
        "invoice_no": extract_invoice_number(text_blocks),
        "invoice_date": extract_date(all_text)
    }

    # Supplier name: the first of the five top-most blocks that is longer
    # than three characters and is not purely digits/punctuation.
    top_blocks = sorted(text_blocks, key=lambda x: x['y'])[:5]
    for block in top_blocks:
        text = block['text'].strip()
        if len(text) > 3 and not re.match(r'^[\d\s.,]+$', text):
            header['supplier_name'] = text
            break

    return header


def find_line_items(text_blocks):
    """Extract line items from invoice.

    Walks the blocks top to bottom, accumulating the first HSN-like code,
    quantity, rate and GST percentage seen; once three or more fields have
    been collected, an item is emitted and accumulation restarts.

    NOTE(review): a single number can satisfy the HSN, quantity and rate
    patterns at once, so a block containing any number is enough to emit an
    item. The heuristic is kept as is; a row-based parser would be more
    selective.

    Returns:
        A list of item dicts with keys ``item_name``, ``hsn``, ``qty``,
        ``unit``, ``rate``, ``discount`` and ``gst_percent``.
    """
    items = []
    current_item = {}

    for block in sorted(text_blocks, key=lambda x: x['y']):
        text = block['text'].strip()

        # HSN-like code: a standalone run of 4 to 8 digits.
        hsn_match = _HSN_RE.search(text)
        if hsn_match and not current_item.get('hsn'):
            current_item['hsn'] = hsn_match.group(0)

        # Quantity with an optional unit suffix.
        qty_match = _QTY_RE.search(text.lower())
        if qty_match and not current_item.get('qty'):
            current_item['qty'] = float(qty_match.group(1))
            current_item['unit'] = qty_match.group(2) if qty_match.group(2) else 'Nos'

        # Rate: the first amount found in the block.
        amounts = extract_amounts(text)
        if amounts and not current_item.get('rate'):
            current_item['rate'] = amounts[0]

        # GST percentage.
        gst_match = _GST_PERCENT_RE.search(text)
        if gst_match and not current_item.get('gst_percent'):
            current_item['gst_percent'] = float(gst_match.group(1))

        # If we have enough info, save item
        if len(current_item) >= 3:
            if 'item_name' not in current_item:
                current_item['item_name'] = text[:50]

            items.append({
                'item_name': current_item.get('item_name', 'Item'),
                'hsn': current_item.get('hsn', ''),
                'qty': current_item.get('qty', 0),
                'unit': current_item.get('unit', 'Nos'),
                'rate': current_item.get('rate', 0),
                'discount': current_item.get('discount', 0),
                'gst_percent': current_item.get('gst_percent', 0)
            })
            current_item = {}

    return items


def calculate_totals(items):
    """Calculate totals from line items.

    Each item dict is updated in place with ``gross_amount``,
    ``taxable_amount``, ``gst_amount`` and ``total_amount`` (rounded to two
    decimals).

    Returns:
        A dict with ``total_gross``, ``total_taxable``, ``total_gst`` and
        ``grand_total``, each rounded to two decimals.
    """
    total_gross = 0
    total_taxable = 0
    total_gst = 0

    for item in items:
        qty = item.get('qty', 0)
        rate = item.get('rate', 0)
        discount = item.get('discount', 0)
        gst_percent = item.get('gst_percent', 0)

        gross = qty * rate
        taxable = gross - discount
        gst_amount = (taxable * gst_percent) / 100

        item['gross_amount'] = round(gross, 2)
        item['taxable_amount'] = round(taxable, 2)
        item['gst_amount'] = round(gst_amount, 2)
        item['total_amount'] = round(taxable + gst_amount, 2)

        total_gross += gross
        total_taxable += taxable
        total_gst += gst_amount

    return {
        'total_gross': round(total_gross, 2),
        'total_taxable': round(total_taxable, 2),
        'total_gst': round(total_gst, 2),
        'grand_total': round(total_taxable + total_gst, 2)
    }


def extract_invoice_data(file):
    """Main function to extract all invoice data.

    Args:
        file: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path. ``None`` means nothing was
            uploaded.

    Returns:
        A JSON string. On success it holds ``header``, ``details``,
        ``footer`` and ``debug_info``; on failure it holds ``error`` and
        ``message`` (plus ``error_type`` and ``traceback`` for unexpected
        exceptions). This function never raises.
    """
    try:
        if file is None:
            return json.dumps({
                "error": "No file provided",
                "message": "Please upload a PDF or image invoice before extracting."
            }, indent=2)

        # Accept both plain path strings and objects exposing `.name`.
        path = str(getattr(file, 'name', file))

        # Convert PDF to image if needed
        if path.lower().endswith('.pdf'):
            # Only the first page is processed, so only that page is rendered.
            images = pdf_to_images(path, first_page=1, last_page=1)
            if not images:
                return json.dumps({
                    "error": "Empty PDF",
                    "message": "The PDF did not contain any pages to process."
                }, indent=2)
            image = images[0]
        else:
            # convert() returns an independent in-memory image, so the file
            # handle can be closed as soon as the block exits.
            with Image.open(path) as opened:
                image = opened.convert('RGB')

        # Extract text with OCR
        text_blocks = extract_text_from_image(image)

        # Check if OCR extracted any text
        if not text_blocks:
            return json.dumps({
                "error": "No text detected",
                "message": "Could not extract any text from the image. Please ensure the image is clear and contains text."
            }, indent=2)

        # Extract different sections
        header = find_header_info(text_blocks)
        details = find_line_items(text_blocks)
        footer = calculate_totals(details)

        # Build final JSON structure
        result = {
            "header": header,
            "details": details,
            "footer": footer,
            "debug_info": {
                "total_text_blocks": len(text_blocks),
                "sample_text": [block['text'] for block in text_blocks[:5]]
            }
        }

        return json.dumps(result, indent=2, ensure_ascii=False)

    except Exception as e:
        # Top-level boundary: report the failure as JSON instead of raising.
        # NOTE(review): the traceback is returned to the caller; consider
        # logging it server-side only if this app is exposed publicly.
        import traceback
        return json.dumps({
            "error": str(e),
            "error_type": type(e).__name__,
            "traceback": traceback.format_exc(),
            "message": "Failed to process invoice"
        }, indent=2)


# Create Gradio Interface
with gr.Blocks(title="Purchase Invoice Data Extraction", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧾 Purchase Invoice Data Extraction API

    Upload purchase invoices (PDF or Image) to automatically extract structured data including:
    - Supplier details (Name, PIN, GSTIN, Contact)
    - Invoice information (Number, Date)
    - Line items (Name, HSN, Qty, Rate, Discounts, GST%)
    - Calculated totals (Gross, Taxable, Tax, Grand Total)
    """)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Invoice (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
            )
            extract_btn = gr.Button("Extract Data", variant="primary", size="lg")

            gr.Markdown("""
            ### Supported Formats:
            - PDF documents
            - PNG, JPG, JPEG images
            - English text
            """)

        with gr.Column():
            output_json = gr.Code(
                label="Extracted Data (JSON)",
                language="json",
                lines=25
            )

    gr.Markdown("""
    ### Output Structure:
    ```json
    {
      "header": {
        "supplier_name": "...",
        "supplier_pincode": "...",
        "gstin": "...",
        "contact_no": "...",
        "invoice_no": "...",
        "invoice_date": "..."
      },
      "details": [
        {
          "item_name": "...",
          "hsn": "...",
          "qty": 0,
          "unit": "...",
          "rate": 0,
          "discount": 0,
          "gst_percent": 0,
          "gross_amount": 0,
          "taxable_amount": 0,
          "gst_amount": 0,
          "total_amount": 0
        }
      ],
      "footer": {
        "total_gross": 0,
        "total_taxable": 0,
        "total_gst": 0,
        "grand_total": 0
      }
    }
    ```

    ---

    ### API Usage:

    **Python Client:**
    ```python
    from gradio_client import Client, handle_file

    client = Client("http://localhost:7860")
    result = client.predict(
        file=handle_file("path/to/invoice.pdf"),
        api_name="/extract_invoice_data"
    )
    print(result)
    ```

    **cURL:**
    ```bash
    curl -X POST http://localhost:7860/api/predict \\
      -F "file=@invoice.pdf"
    ```
    """)

    extract_btn.click(
        fn=extract_invoice_data,
        inputs=[file_input],
        outputs=[output_json]
    )

    # Example usage
    gr.Examples(
        examples=[],
        inputs=[file_input],
        outputs=[output_json],
        fn=extract_invoice_data,
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True
    )