Spaces:
Running
Running
| import gradio as gr | |
| import json | |
| import re | |
| from datetime import datetime | |
| from paddleocr import PaddleOCR | |
| from PIL import Image | |
| import pdf2image | |
| import numpy as np | |
# Module-level OCR engine, constructed once when the module is imported and
# used by extract_text_from_image(). It is configured for English ('en') with
# the text-line orientation option switched on.
ocr = PaddleOCR(lang='en', use_textline_orientation=True)
def _legacy_ocr_entries(lines):
    """Return ``(bbox, text, confidence)`` triples from list-style OCR output.

    Each usable entry of *lines* looks like ``[bbox, (text, confidence)]`` or
    ``[bbox, text]``. Malformed entries are skipped rather than raising.
    """
    entries = []
    for line in lines:
        try:
            if not line or len(line) < 2:
                continue
            bbox, text_info = line[0], line[1]
            if isinstance(text_info, (tuple, list)):
                text = text_info[0]
                confidence = text_info[1] if len(text_info) > 1 else 0.0
            else:
                text = str(text_info)
                confidence = 0.0
        except (IndexError, TypeError, KeyError, ValueError):
            # Best effort: one broken entry must not abort the whole page.
            continue
        entries.append((bbox, text, confidence))
    return entries


def _mapping_ocr_entries(page):
    """Return ``(bbox, text, confidence)`` triples from a dict-style OCR result.

    NOTE(review): assumes the PaddleOCR 3.x result layout, i.e. parallel
    sequences under the keys ``'rec_texts'``, ``'rec_scores'`` and
    ``'rec_polys'`` -- confirm against the installed PaddleOCR version.
    """
    texts = page.get('rec_texts')
    if texts is None:
        return []
    scores = page.get('rec_scores')
    polys = page.get('rec_polys')

    entries = []
    for index, text in enumerate(texts):
        # Explicit ``is None`` / length checks: these values may be numpy
        # arrays, whose truth value is ambiguous.
        bbox = polys[index] if polys is not None and index < len(polys) else None
        confidence = (
            scores[index] if scores is not None and index < len(scores) else 0.0
        )
        entries.append((bbox, text, confidence))
    return entries


def extract_text_from_image(image):
    """Run OCR on *image* and return its text blocks with centre positions.

    Args:
        image: A ``PIL.Image.Image`` (converted to a numpy array first) or
            anything else the module-level ``ocr`` engine accepts directly.

    Returns:
        A list of dicts with the keys ``'text'``, ``'y'``, ``'x'`` and
        ``'confidence'``. ``'x'``/``'y'`` are the midpoint between the first
        and third corner of the block's quadrilateral, as plain floats.
        The list is empty when nothing was recognised.

    Both result layouts are understood: the list-of-lines layout
    (``[bbox, (text, confidence)]``) and the dict-style per-page result that
    is handled by ``_mapping_ocr_entries``. Entries with a missing or
    malformed bounding box are skipped.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)

    result = ocr.ocr(image)
    if result is None or len(result) == 0:
        return []

    # Only the first page/result is consumed, as before.
    page = result[0]
    if page is None:
        return []

    if isinstance(page, dict):
        entries = _mapping_ocr_entries(page)
    else:
        entries = _legacy_ocr_entries(page)

    text_blocks = []
    for bbox, text, confidence in entries:
        try:
            # bbox is expected to hold 4 points [[x1,y1], ..., [x4,y4]];
            # lists, tuples and numpy arrays are all acceptable.
            if bbox is None or len(bbox) < 4:
                continue
            # Convert before adding so small integer dtypes cannot overflow.
            x_center = (float(bbox[0][0]) + float(bbox[2][0])) / 2
            y_center = (float(bbox[0][1]) + float(bbox[2][1])) / 2
        except (IndexError, TypeError, KeyError, ValueError):
            # Skip entries whose geometry cannot be interpreted.
            continue

        try:
            confidence = float(confidence)
        except (TypeError, ValueError):
            confidence = 0.0

        text_blocks.append({
            'text': text,
            'y': y_center,
            'x': x_center,
            'confidence': confidence,
        })
    return text_blocks
def pdf_to_images(pdf_file):
    """Render the PDF located at *pdf_file* into images.

    Thin wrapper: the path is handed to ``pdf2image.convert_from_path`` and
    whatever that call returns is passed back unchanged.
    """
    return pdf2image.convert_from_path(pdf_file)
def extract_gstin(text):
    """Return the first GSTIN-shaped token found in *text*, or ``None``.

    The shape is: 2 digits, 5 capital letters, 4 digits, 1 capital letter,
    1 capital letter or digit, a literal ``Z``, 1 capital letter or digit.
    Matching is case-sensitive and not anchored to word boundaries.
    """
    found = re.compile(
        r'\d{2}[A-Z]{5}\d{4}[A-Z]{1}[A-Z\d]{1}[Z]{1}[A-Z\d]{1}'
    ).search(text)
    if found is None:
        return None
    return found.group(0)
def extract_pincode(text):
    """Return the first standalone run of exactly six digits, or ``None``.

    "Standalone" means delimited by word boundaries, so six digits inside a
    longer number or glued to letters do not count.
    """
    found = re.compile(r'\b\d{6}\b').search(text)
    if found is None:
        return None
    return found.group(0)
def extract_mobile(text):
    """Return the first standalone 10-digit number starting with 6-9, or ``None``.

    The number must sit between word boundaries, so a country code written
    without a separator (e.g. ``+919876543210``) is not recognised.
    """
    found = re.compile(r'\b[6-9]\d{9}\b').search(text)
    if found is None:
        return None
    return found.group(0)
def extract_date(text):
    """Return the first date-like token in *text*, or ``None``.

    Formats are tried in priority order over the whole text:
    ``dd-mm-yyyy``, ``dd-mm-yy``, then ``yyyy-mm-dd`` (``-`` or ``/`` as
    separator). Only the shape is checked; the value is not validated as a
    real calendar date.

    Every pattern is guarded so that it cannot start or end in the middle of
    a longer digit run. Without the guards the two-digit-year pattern
    matches inside an ISO date (``2024-01-15`` -> ``24-01-15``) and the
    ISO pattern is never reached.
    """
    date_patterns = (
        r'(?<!\d)\d{2}[-/]\d{2}[-/]\d{4}(?!\d)',
        r'(?<!\d)\d{2}[-/]\d{2}[-/]\d{2}(?!\d)',
        r'(?<!\d)\d{4}[-/]\d{2}[-/]\d{2}(?!\d)',
    )
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None
def extract_invoice_number(text_blocks):
    """Return the invoice/bill number that follows an "Invoice No"-style label.

    Args:
        text_blocks: Sequence of dicts that each carry a ``'text'`` entry.

    Returns:
        The first token containing at least one digit that appears after the
        label in the same block, or failing that, the first such token in the
        immediately following block. ``None`` when no label or no number is
        found.

    The search for the number starts *after* the label. Searching the whole
    block would pick up the label's own capital letter (``"Invoice No:
    INV-001"`` would yield ``"I"``).
    """
    label_re = re.compile(r'(invoice|bill)\s*(no|number|#)', re.IGNORECASE)
    token_re = re.compile(r'[A-Z0-9][A-Z0-9/-]*', re.IGNORECASE)

    def first_number_token(fragment):
        # A plausible invoice number must contain a digit; this skips words
        # such as "Date" that also satisfy the token pattern.
        for candidate in token_re.finditer(fragment):
            token = candidate.group(0).rstrip('/-')
            if any(ch.isdigit() for ch in token):
                return token
        return None

    for index, block in enumerate(text_blocks):
        text = block['text']
        label = label_re.search(text)
        if label is None:
            continue

        number = first_number_token(text[label.end():])
        if number is None and index + 1 < len(text_blocks):
            # OCR frequently splits the label and its value into two blocks.
            number = first_number_token(text_blocks[index + 1]['text'])
        if number is not None:
            return number
    return None
def extract_amounts(text):
    """Return every number found in *text* as a float, in order of appearance.

    A number is a digit run with optional comma-separated groups and an
    optional two-digit decimal part; a leading rupee sign and whitespace are
    tolerated. Commas are removed before conversion. A decimal part that is
    not exactly two digits is not consumed, so ``12.5`` yields ``12.0`` and
    ``5.0``.
    """
    values = []
    for raw in re.findall(r'₹?\s*(\d+(?:,\d+)*(?:\.\d{2})?)', text):
        values.append(float(raw.replace(',', '')))
    return values
def find_header_info(text_blocks):
    """Build the invoice header (supplier and invoice identifiers).

    Args:
        text_blocks: List of dicts with at least ``'text'`` and ``'y'``.

    Returns:
        A dict with the keys ``supplier_name``, ``supplier_pincode``,
        ``gstin``, ``contact_no``, ``invoice_no`` and ``invoice_date``.
        Any value that could not be found is ``None``.
    """
    joined_text = ' '.join(block['text'] for block in text_blocks)

    header = {"supplier_name": None}
    header["supplier_pincode"] = extract_pincode(joined_text)
    header["gstin"] = extract_gstin(joined_text)
    header["contact_no"] = extract_mobile(joined_text)
    header["invoice_no"] = extract_invoice_number(text_blocks)
    header["invoice_date"] = extract_date(joined_text)

    # The supplier name is taken from the five topmost blocks (smallest y):
    # the first one longer than 3 characters that is not made up solely of
    # digits, whitespace, dots and commas.
    numeric_only = re.compile(r'^[\d\s.,]+$')
    topmost = sorted(text_blocks, key=lambda blk: blk['y'])[:5]
    stripped = [blk['text'].strip() for blk in topmost]
    candidates = [
        value for value in stripped
        if len(value) > 3 and numeric_only.match(value) is None
    ]
    if candidates:
        header['supplier_name'] = candidates[0]
    return header
def find_line_items(text_blocks):
    """Heuristically assemble line items from OCR text blocks.

    Blocks are visited top to bottom (ascending ``'y'``). Each block may
    contribute an HSN code, a quantity with unit, a rate and a GST
    percentage to a pending item; a field that already holds a truthy value
    is not overwritten. As soon as the pending item holds three or more
    fields it is emitted, named after the first 50 characters of the block
    that completed it, and a fresh pending item is started.

    Note that the quantity and rate patterns both accept any bare number, so
    a single block containing one number already fills three fields
    (``qty``, ``unit``, ``rate``) and is emitted as an item.

    Returns:
        A list of dicts with the keys ``item_name``, ``hsn``, ``qty``,
        ``unit``, ``rate``, ``discount`` and ``gst_percent``.
    """
    hsn_re = re.compile(r'\b\d{4,8}\b')
    qty_re = re.compile(r'\b(\d+(?:\.\d+)?)\s*(pcs|nos|kg|ltr|box|unit)?')
    amount_re = re.compile(r'₹?\s*(\d+(?:,\d+)*(?:\.\d{2})?)')
    gst_re = re.compile(r'(\d+(?:\.\d+)?)\s*%')

    items = []
    pending = {}
    for block in sorted(text_blocks, key=lambda blk: blk['y']):
        text = block['text'].strip()

        # HSN code: a standalone run of 4 to 8 digits.
        if not pending.get('hsn'):
            hsn_found = hsn_re.search(text)
            if hsn_found:
                pending['hsn'] = hsn_found.group(0)

        # Quantity: first standalone number, optionally followed by a unit.
        if not pending.get('qty'):
            qty_found = qty_re.search(text.lower())
            if qty_found:
                pending['qty'] = float(qty_found.group(1))
                pending['unit'] = qty_found.group(2) or 'Nos'

        # Rate: the first amount-looking number in the block.
        amounts_found = amount_re.findall(text)
        if amounts_found and not pending.get('rate'):
            pending['rate'] = float(amounts_found[0].replace(',', ''))

        # GST percentage: a number directly followed by '%'.
        if not pending.get('gst_percent'):
            gst_found = gst_re.search(text)
            if gst_found:
                pending['gst_percent'] = float(gst_found.group(1))

        # Not enough collected yet: carry the pending fields to the next block.
        if len(pending) < 3:
            continue

        pending.setdefault('item_name', text[:50])
        items.append({
            'item_name': pending.get('item_name', 'Item'),
            'hsn': pending.get('hsn', ''),
            'qty': pending.get('qty', 0),
            'unit': pending.get('unit', 'Nos'),
            'rate': pending.get('rate', 0),
            'discount': pending.get('discount', 0),
            'gst_percent': pending.get('gst_percent', 0),
        })
        pending = {}
    return items
def calculate_totals(items):
    """Compute per-item amounts in place and return the invoice totals.

    For every item: gross = qty * rate, taxable = gross - discount and
    GST = taxable * gst_percent / 100 (missing fields count as 0). Each item
    dict is updated with ``gross_amount``, ``taxable_amount``,
    ``gst_amount`` and ``total_amount``, all rounded to 2 decimals.

    Returns:
        A dict with ``total_gross``, ``total_taxable``, ``total_gst`` and
        ``grand_total``. The sums are accumulated from the unrounded
        per-item values and rounded to 2 decimals at the end.
    """
    gross_sum = 0
    taxable_sum = 0
    gst_sum = 0

    for entry in items:
        gross = entry.get('qty', 0) * entry.get('rate', 0)
        taxable = gross - entry.get('discount', 0)
        gst_amount = (taxable * entry.get('gst_percent', 0)) / 100

        entry.update(
            gross_amount=round(gross, 2),
            taxable_amount=round(taxable, 2),
            gst_amount=round(gst_amount, 2),
            total_amount=round(taxable + gst_amount, 2),
        )

        gross_sum += gross
        taxable_sum += taxable
        gst_sum += gst_amount

    totals = {}
    totals['total_gross'] = round(gross_sum, 2)
    totals['total_taxable'] = round(taxable_sum, 2)
    totals['total_gst'] = round(gst_sum, 2)
    totals['grand_total'] = round(taxable_sum + gst_sum, 2)
    return totals
def extract_invoice_data(file):
    """Extract header, line items and totals from an uploaded invoice.

    Args:
        file: The uploaded invoice. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` (nothing uploaded) is reported as an error.

    Returns:
        A JSON string. On success it contains ``header``, ``details``,
        ``footer`` and ``debug_info``. On failure it contains ``error`` and
        ``message``; for unexpected exceptions it also carries
        ``error_type`` and ``traceback``. This function never raises.

    For PDFs only the first page is processed.
    """
    import os
    import traceback

    if file is None:
        # Clicking "Extract" without an upload is a user error, not a crash.
        return json.dumps({
            "error": "No file provided",
            "message": "Please upload a PDF or image invoice before extracting."
        }, indent=2)

    try:
        # Accept both plain paths and upload wrappers that expose `.name`.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name

        # Convert PDF to image if needed
        if path.lower().endswith('.pdf'):
            images = pdf_to_images(path)
            if not images:
                return json.dumps({
                    "error": "Empty PDF",
                    "message": "The PDF did not contain any pages to process."
                }, indent=2)
            image = images[0]  # Process first page
        else:
            # Copy the pixels out so the underlying file handle is released
            # as soon as the `with` block ends.
            with Image.open(path) as opened:
                image = opened.copy()

        # Extract text with OCR
        text_blocks = extract_text_from_image(image)

        # Check if OCR extracted any text
        if not text_blocks:
            return json.dumps({
                "error": "No text detected",
                "message": "Could not extract any text from the image. Please ensure the image is clear and contains text."
            }, indent=2)

        # Extract different sections
        header = find_header_info(text_blocks)
        details = find_line_items(text_blocks)
        footer = calculate_totals(details)

        # Build final JSON structure
        result = {
            "header": header,
            "details": details,
            "footer": footer,
            "debug_info": {
                "total_text_blocks": len(text_blocks),
                "sample_text": [block['text'] for block in text_blocks[:5]]
            }
        }
        return json.dumps(result, indent=2, ensure_ascii=False)
    except Exception as e:
        # Top-level boundary for the UI/API: report the failure as JSON
        # instead of letting the exception propagate to the caller.
        return json.dumps({
            "error": str(e),
            "error_type": type(e).__name__,
            "traceback": traceback.format_exc(),
            "message": "Failed to process invoice"
        }, indent=2)
# Create Gradio Interface
#
# Layout: an upload column (file picker + button) next to an output column
# showing the extracted JSON, followed by static usage documentation.
with gr.Blocks(title="Purchase Invoice Data Extraction", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧾 Purchase Invoice Data Extraction API
    Upload purchase invoices (PDF or Image) to automatically extract structured data including:
    - Supplier details (Name, PIN, GSTIN, Contact)
    - Invoice information (Number, Date)
    - Line items (Name, HSN, Qty, Rate, Discounts, GST%)
    - Calculated totals (Gross, Taxable, Tax, Grand Total)
    """)
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Invoice (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
            )
            extract_btn = gr.Button("Extract Data", variant="primary", size="lg")
            # Only English is listed: the module-level OCR engine is created
            # with lang='en', so advertising Hindi support was inaccurate.
            gr.Markdown("""
            ### Supported Formats:
            - PDF documents
            - PNG, JPG, JPEG images
            - English text
            """)
        with gr.Column():
            output_json = gr.Code(
                label="Extracted Data (JSON)",
                language="json",
                lines=25
            )
    # NOTE(review): the cURL example below posts multipart form data to
    # /api/predict; verify it against the REST API of the installed Gradio
    # version before relying on it.
    gr.Markdown("""
    ### Output Structure:
    ```json
    {
      "header": {
        "supplier_name": "...",
        "supplier_pincode": "...",
        "gstin": "...",
        "contact_no": "...",
        "invoice_no": "...",
        "invoice_date": "..."
      },
      "details": [
        {
          "item_name": "...",
          "hsn": "...",
          "qty": 0,
          "unit": "...",
          "rate": 0,
          "discount": 0,
          "gst_percent": 0,
          "gross_amount": 0,
          "taxable_amount": 0,
          "gst_amount": 0,
          "total_amount": 0
        }
      ],
      "footer": {
        "total_gross": 0,
        "total_taxable": 0,
        "total_gst": 0,
        "grand_total": 0
      }
    }
    ```
    ---
    ### API Usage:
    **Python Client:**
    ```python
    from gradio_client import Client
    client = Client("http://localhost:7860")
    result = client.predict(
        file="path/to/invoice.pdf",
        api_name="/predict"
    )
    print(result)
    ```
    **cURL:**
    ```bash
    curl -X POST http://localhost:7860/api/predict \\
      -F "file=@invoice.pdf"
    ```
    """)
    # Register the handler under the endpoint name that the usage
    # documentation above tells clients to call ("/predict").
    extract_btn.click(
        fn=extract_invoice_data,
        inputs=[file_input],
        outputs=[output_json],
        api_name="predict"
    )
    # Example usage (no example files are bundled yet)
    gr.Examples(
        examples=[],
        inputs=[file_input],
        outputs=[output_json],
        fn=extract_invoice_data,
        cache_examples=False
    )
if __name__ == "__main__":
    # Started as a script: serve the UI on all network interfaces at port
    # 7860, with link sharing off and show_api on.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_api": True,
    }
    demo.launch(**launch_options)