Spaces:

abinash73
/

inv-ocr

Sleeping

App Files Files Community

inv-ocr / app.py

abinash73

Update app.py

6ac6ef8 verified 6 months ago

raw

history blame contribute delete

19.9 kB

	# ================================
	# FIX PIL ANTIALIAS (new Pillow versions)
	# ================================
	from PIL import Image

	# Compatibility shim: code that still refers to Image.ANTIALIAS keeps
	# working on Pillow builds that no longer define it. The assignment has to
	# sit inside the `if`, otherwise the module does not even parse.
	if not hasattr(Image, "ANTIALIAS"):
	    Image.ANTIALIAS = Image.Resampling.LANCZOS

	# ================================
	# IMPORTS
	# ================================
	import gradio as gr
	import easyocr
	import fitz # PyMuPDF
	import numpy as np
	from fastapi import FastAPI, UploadFile, File, HTTPException
	from fastapi.responses import JSONResponse
	from fastapi.middleware.cors import CORSMiddleware
	from PIL import Image as PILImage
	import re
	import os
	import tempfile


	# =====================================================
	# LOAD OCR MODEL (only once)
	# =====================================================
	# Build the shared EasyOCR reader once at import time; run_ocr() reuses this
	# module-level object for every page/image instead of constructing a new one.
	print("Loading EasyOCR model...")
	# English only, GPU explicitly disabled.
	reader = easyocr.Reader(["en"], gpu=False)
	print("Model loaded successfully!")


	# =====================================================
	# PDF → IMAGE CONVERSION
	# =====================================================
	def pdf_to_images(pdf_path):
	    """Render every page of a PDF to an RGB PIL image at 200 dpi.

	    Args:
	        pdf_path: Path of the PDF file to open with PyMuPDF.

	    Returns:
	        A list with one PIL image per page, in page order. If anything goes
	        wrong while opening or rendering, the error is printed and an empty
	        list is returned (pages rendered before the failure are discarded).
	    """
	    images = []
	    try:
	        # The context manager closes the document even if rendering fails
	        # half-way through; previously the handle was never closed.
	        with fitz.open(pdf_path) as doc:
	            for page in doc:
	                pix = page.get_pixmap(dpi=200)
	                images.append(
	                    PILImage.frombytes("RGB", (pix.width, pix.height), pix.samples)
	                )
	    except Exception as e:
	        # Best effort by design: callers treat "no pages" as "nothing to OCR".
	        print("PDF ERROR:", e)
	        return []
	    return images


	# =====================================================
	# EXTRACT ITEMS (Enhanced parsing)
	# =====================================================
	# Patterns used by the line-item parser (compiled once, not per line).
	_ITEM_SERIAL_RE = re.compile(r"^\d+\s+")  # serial number at line start: "1 "
	_ITEM_HSN_RE = re.compile(r"\b\d{6,8}\b")  # 6-8 digit HSN/SAC code
	# Plain ("1200.00"), western ("1,200.00") and lakh-grouped ("1,20,000.00")
	# amounts are each matched as ONE number.
	_ITEM_AMOUNT_RE = re.compile(r"\d+(?:,\d{2,3})*(?:\.\d{1,2})?")
	# EACH is listed before EA so that "10 EACH" is not reported as unit "EA".
	_ITEM_QTY_RE = re.compile(
	    r"(\d+(?:\.\d{1,2})?)\s*"
	    r"(NOS|PCS|PC|KG|KILO|LTR|LITRE|MTR|METRE|BOX|SET|UNIT|EACH|EA)",
	    re.I,
	)
	# Alphanumeric token of 6+ characters with an optional suffix: "84408596-P".
	_ITEM_PART_RE = re.compile(r"\b([A-Z0-9]{6,}(?:-[A-Z0-9]+)?)\b")
	_ITEM_HEADER_RE = re.compile(r"Description|Goods|HSN|Quantity|Rate|Amount", re.I)
	_ITEM_STOP_RE = re.compile(r"^(Total|Sub|CGST|SGST|IGST|Round)", re.I)
	_ITEM_DISCOUNT_RE = re.compile(r"Disc|Discount", re.I)
	# Number of lines inspected after an item row (rows i+1 .. i+19).
	_ITEM_SCAN_WINDOW = 19


	def _match_item_start(line):
	    """Return the serial-number match if *line* opens a new item row, else None."""
	    match = _ITEM_SERIAL_RE.match(line)
	    if match is None:
	        return None
	    # "10 NOS" is a quantity cell, not row number 10.
	    if _ITEM_QTY_RE.fullmatch(line):
	        return None
	    # "18 %" or "2 1200.00" carry no description text, so they are not rows.
	    if not re.search(r"[A-Za-z]", line[match.end():]):
	        return None
	    return match


	def _last_amount(line):
	    """Return the last amount on *line* with thousands separators removed, or ""."""
	    amounts = _ITEM_AMOUNT_RE.findall(line)
	    return amounts[-1].replace(",", "") if amounts else ""


	def _scan_item_details(lines, first):
	    """Collect the detail fields of one item from the lines following its row.

	    Scanning starts at index *first* and stops at the next item row, at a
	    totals/tax summary line, or after ``_ITEM_SCAN_WINDOW`` lines. For every
	    field the first hit wins.
	    """
	    hsn = part_no = quantity = uom = ""
	    rate = taxable = gst_pct = gst_amount = disc_pct = disc_amount = ""

	    for current in lines[first:first + _ITEM_SCAN_WINDOW]:
	        if _match_item_start(current) or _ITEM_STOP_RE.search(current):
	            break

	        if not hsn:
	            hsn_match = _ITEM_HSN_RE.search(current)
	            if hsn_match:
	                candidate = hsn_match.group()
	                # A number that is repeated on the same line is not accepted.
	                if not re.search(rf"\b{candidate}\b.*\b{candidate}\b", current):
	                    hsn = candidate

	        if not part_no:
	            # Try every token on the line, not only the first one, so an HSN
	            # printed in front of the part number does not hide it.
	            for candidate in _ITEM_PART_RE.findall(current):
	                if candidate != hsn and not re.fullmatch(r"\d{10}", candidate):
	                    part_no = candidate
	                    break

	        if not quantity:
	            qty_match = _ITEM_QTY_RE.search(current)
	            if qty_match:
	                quantity, uom = qty_match.group(1), qty_match.group(2)

	        if not gst_pct:
	            pct_match = re.search(r"(\d{1,2})\s*%", current)
	            if pct_match and not _ITEM_DISCOUNT_RE.search(current):
	                pct = pct_match.group(1)
	                # A CGST/SGST line shows half of the total rate; double it.
	                if re.search(r"CGST|SGST", current, re.I):
	                    gst_pct = str(int(pct) * 2) + "%"
	                else:
	                    gst_pct = pct + "%"

	        if not disc_amount and not disc_pct and _ITEM_DISCOUNT_RE.search(current):
	            disc_match = re.search(r"(\d+(?:\.\d{2})?)\s*%", current)
	            if disc_match:
	                disc_pct = disc_match.group(1) + "%"
	            else:
	                amount_match = _ITEM_AMOUNT_RE.search(current)
	                if amount_match:
	                    # Normalised like every other money field.
	                    disc_amount = amount_match.group().replace(",", "")

	        if not rate and re.search(r"Rate(?!\s*%)", current, re.I):
	            rate = _last_amount(current)

	        if (not taxable
	                and re.search(r"Taxable|Value", current, re.I)
	                and not re.search(r"Rate", current, re.I)):
	            taxable = _last_amount(current)

	        if not gst_amount and re.search(r"(CGST|SGST|IGST).*Amount", current, re.I):
	            # NOTE: only the last amount on the line is kept; CGST and SGST
	            # amounts are not summed.
	            gst_amount = _last_amount(current)

	    # Derive whichever of rate / taxable value is missing from the other one.
	    if not rate and taxable and quantity:
	        try:
	            rate = str(round(float(taxable) / float(quantity), 2))
	        except (ValueError, ZeroDivisionError):
	            pass  # quantity "0" or an unparsable number: leave the rate empty
	    if not taxable and rate and quantity:
	        try:
	            taxable = str(round(float(rate) * float(quantity), 2))
	        except ValueError:
	            pass

	    return {
	        "Part Number": part_no,
	        "HSN/SAC": hsn,
	        "Quantity": quantity,
	        "UOM": uom,
	        "Rate (Excl. Tax)": rate,
	        "Taxable Value": taxable,
	        "GST Percentage": gst_pct,
	        "GST Amount": gst_amount,
	        "Discount Percentage": disc_pct,
	        "Discount Amount": disc_amount,
	    }


	def extract_items(text: str) -> list:
	    """Parse invoice line items out of OCR text.

	    A line that starts with a serial number followed by a description opens an
	    item; the following lines are scanned for HSN, part number, quantity/UOM,
	    rate, taxable value, GST and discount.

	    Args:
	        text: OCR output with one text fragment per line.

	    Returns:
	        A list of dicts, one per item, with the keys "Serial", "Product Name",
	        "Part Number", "HSN/SAC", "Quantity", "UOM", "Rate (Excl. Tax)",
	        "Taxable Value", "GST Percentage", "GST Amount",
	        "Discount Percentage" and "Discount Amount". All values are strings;
	        fields that were not found are "".
	    """
	    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
	    items = []

	    for idx, line in enumerate(lines):
	        start = _match_item_start(line)
	        if start is None:
	            continue

	        product_name = line[start.end():].strip()
	        # A "row" made of column titles is the table header, not an item.
	        if _ITEM_HEADER_RE.search(product_name):
	            continue

	        try:
	            details = _scan_item_details(lines, idx + 1)
	        except Exception as e:
	            # Best effort: one malformed row must not abort the whole invoice.
	            print(f"Item extraction error at line {idx}: {e}")
	            continue

	        product_name = re.sub(r"\s+", " ", product_name).strip()
	        if product_name or details["HSN/SAC"] or details["Quantity"]:
	            items.append({
	                "Serial": start.group().strip(),
	                "Product Name": product_name,
	                **details,
	            })

	    return items


	# =====================================================
	# FIELD EXTRACTION (Vendor, Buyer, GSTIN, Invoice)
	# =====================================================
	# A GSTIN is 15 characters: 2-digit state code, 10-character PAN
	# (5 letters, 4 digits, 1 letter) and 3 trailing alphanumerics.
	_FIELD_GSTIN_RE = re.compile(r"\b\d{2}[A-Z]{5}\d{4}[A-Z][A-Z\d]{3}\b")
	# Indian mobile number with optional +91 / 91 / 0 prefix; the look-arounds
	# keep it from matching a slice of a longer digit run.
	_FIELD_PHONE_RE = re.compile(r"(?<!\d)(?:(?:\+?91|0)[\s-]?)?[6-9]\d{9}(?!\d)")
	# \bPAN\b: a bare "PAN" would also reject every name containing "Company".
	_FIELD_VENDOR_SKIP_RE = re.compile(
	    r"GSTIN|GST|\bPAN\b|ADDRESS|PHONE|EMAIL|^\d+$", re.I
	)
	_FIELD_BUYER_SKIP_RE = re.compile(
	    r"GSTIN|GST|STATE|CODE|ADDRESS|PHONE|GROUND|FLOOR|KHATA|PLOT|OPPOSITE|UNIT|DATED|PLACE",
	    re.I,
	)
	_FIELD_BUYER_KEYWORDS = ("BUYER", "BILL TO", "BILLED TO", "CUSTOMER", "CONSIGNEE", "SHIP TO")
	_FIELD_INV_KEYWORDS = ("INVOICE NO", "INVOICE NUMBER", "INV NO", "BILL NO", "BILL NUMBER")
	_FIELD_INV_SKIP_WORDS = ("KHATA", "PLOT", "POST", "LANE", "STATE", "DATED", "ORIGINAL", "DUPLICATE")
	# The back-reference forces both separators to be identical, so an invoice
	# number such as "12/24-25" is not mistaken for a date.
	_FIELD_DATE_RES = (
	    re.compile(r"\b\d{1,2}-[A-Za-z]{3}-\d{2,4}\b"),      # 16-Aug-25
	    re.compile(r"\b\d{1,2}([-/])\d{1,2}\1\d{2,4}\b"),    # DD-MM-YYYY or DD/MM/YYYY
	    re.compile(r"\b\d{4}([-/])\d{1,2}\1\d{1,2}\b"),      # YYYY-MM-DD
	)
	_FIELD_MONTHS = frozenset(
	    ("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec")
	)


	def _field_vendor_name(lines):
	    """Return the first name-like line after the first INVOICE heading, else one of the top 5 lines."""
	    for idx, line in enumerate(lines):
	        if "INVOICE" in line.upper():
	            for candidate in lines[idx + 1:idx + 5]:
	                if len(candidate) > 3 and not _FIELD_VENDOR_SKIP_RE.search(candidate):
	                    return candidate
	            break  # only the first heading is considered
	    for line in lines[:5]:
	        if len(line) > 3 and not re.search(r"INVOICE|ORIGINAL|DUPLICATE", line, re.I):
	            return line
	    return ""


	def _field_gstin_near(lines, start, exclude=""):
	    """Return the first GSTIN on lines[start:start + 3] that differs from *exclude*."""
	    for line in lines[start:start + 3]:
	        match = _FIELD_GSTIN_RE.search(line)
	        if match and match.group() != exclude:
	            return match.group()
	    return ""


	def _field_mentions_gstin(line):
	    """Tell whether *line* carries a GSTIN/UIN label."""
	    upper = line.upper()
	    return "GSTIN" in upper or "UIN" in upper


	def _field_vendor_gstin(lines):
	    """Return the first GSTIN found next to a GSTIN/UIN label in the first half of the text."""
	    for idx in range(len(lines) // 2):
	        if _field_mentions_gstin(lines[idx]):
	            found = _field_gstin_near(lines, idx)
	            if found:
	                return found
	    return ""


	def _field_buyer_gstin(lines, vendor_gstin):
	    """Return the GSTIN labelled within 10 lines of the first BUYER / BILL TO line."""
	    for idx, line in enumerate(lines):
	        upper = line.upper()
	        if "BUYER" in upper or "BILL TO" in upper:
	            for j in range(idx, min(idx + 10, len(lines))):
	                if _field_mentions_gstin(lines[j]):
	                    return _field_gstin_near(lines, j, exclude=vendor_gstin)
	            return ""  # only the first buyer heading is considered
	    return ""


	def _field_buyer_name(lines):
	    """Return the first name-like line within 9 lines after the first buyer heading."""
	    for idx, line in enumerate(lines):
	        upper = line.upper()
	        if any(keyword in upper for keyword in _FIELD_BUYER_KEYWORDS):
	            for candidate in lines[idx + 1:idx + 10]:
	                if (3 < len(candidate) < 50
	                        and not _FIELD_BUYER_SKIP_RE.search(candidate)
	                        and re.search(r"[A-Z]", candidate)):
	                    return candidate
	            return ""
	    return ""


	def _field_phones(lines):
	    """Return every phone number found in *lines*, in order, without surrounding whitespace."""
	    return [m.group().strip() for line in lines for m in _FIELD_PHONE_RE.finditer(line)]


	def _field_same_phone(a, b):
	    """Compare two phone strings by their last 10 digits, ignoring prefix and punctuation."""
	    return re.sub(r"\D", "", a)[-10:] == re.sub(r"\D", "", b)[-10:]


	def _field_has_skip_word(value):
	    upper = value.upper()
	    return any(word in upper for word in _FIELD_INV_SKIP_WORDS)


	def _field_invoice_number(lines):
	    """Return the invoice number from the first line carrying an invoice-number label."""
	    for idx, line in enumerate(lines):
	        upper = line.upper()
	        if not any(keyword in upper for keyword in _FIELD_INV_KEYWORDS):
	            continue
	        # "Invoice No: ABC/123" -> value after the first colon on the same line.
	        parts = line.split(":")
	        if len(parts) > 1:
	            inline = parts[1].strip()
	            if inline and not _field_has_skip_word(inline):
	                return inline
	        # Otherwise the value is expected on one of the next 4 lines.
	        for candidate in lines[idx + 1:idx + 5]:
	            if (not _field_has_skip_word(candidate)
	                    and len(candidate) > 2
	                    and re.search(r"[A-Z0-9]", candidate)):
	                return candidate
	        return ""
	    return ""


	def _field_is_plausible_date(candidate):
	    """Sanity-check the day/month parts of a string matched by ``_FIELD_DATE_RES``."""
	    first, middle, last = re.split(r"[-/]", candidate)
	    if middle.isalpha():
	        # Month names are compared case-insensitively so "16-AUG-25" is accepted.
	        return middle.lower() in _FIELD_MONTHS and 1 <= int(first) <= 31
	    if len(first) == 4:  # YYYY-MM-DD
	        a, b = int(middle), int(last)
	    else:
	        a, b = int(first), int(middle)
	    # Day and month in either order: both within 1..31 and one of them a month.
	    return 1 <= a <= 31 and 1 <= b <= 31 and min(a, b) <= 12


	def _field_date_in(line):
	    """Return the first plausible date on *line*, trying the patterns in order."""
	    for pattern in _FIELD_DATE_RES:
	        match = pattern.search(line)
	        if match and _field_is_plausible_date(match.group(0)):
	            return match.group(0)
	    return ""


	def _field_invoice_date(lines):
	    """Return the date on, or within 3 lines after, the first line mentioning DATE."""
	    for idx, line in enumerate(lines):
	        if "DATE" not in line.upper():
	            continue
	        for candidate_line in lines[idx:idx + 4]:
	            found = _field_date_in(candidate_line)
	            if found:
	                return found
	        return ""
	    return ""


	def extract_fields(text: str) -> dict:
	    """Extract header fields and line items from invoice OCR text.

	    Args:
	        text: OCR output with one text fragment per line.

	    Returns:
	        A dict with the keys "Vendor Name", "Vendor GSTIN", "Vendor Contact",
	        "Buyer Name", "Buyer GSTIN", "Buyer Contact", "Invoice Number",
	        "Invoice Date" (strings, "" when not found) and "Items" (the list
	        returned by ``extract_items``).
	    """
	    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
	    total = len(lines)

	    vendor_gstin = _field_vendor_gstin(lines)

	    # Vendor phone: first number in the first half of the document.
	    vendor_phones = _field_phones(lines[:total // 2])
	    vendor_contact = vendor_phones[0] if vendor_phones else ""

	    # Buyer phone: first number in the middle half that is not the vendor's.
	    buyer_contact = ""
	    for phone in _field_phones(lines[total // 4:3 * total // 4]):
	        if not _field_same_phone(phone, vendor_contact):
	            buyer_contact = phone
	            break

	    return {
	        "Vendor Name": _field_vendor_name(lines),
	        "Vendor GSTIN": vendor_gstin,
	        "Vendor Contact": vendor_contact,
	        "Buyer Name": _field_buyer_name(lines),
	        "Buyer GSTIN": _field_buyer_gstin(lines, vendor_gstin),
	        "Buyer Contact": buyer_contact,
	        "Invoice Number": _field_invoice_number(lines),
	        "Invoice Date": _field_invoice_date(lines),
	        "Items": extract_items(text),
	    }


	# =====================================================
	# OCR MAIN FUNCTION
	# =====================================================
	def run_ocr(file_path):
	    """Run OCR on a PDF or image file and parse the recognised text.

	    Files whose name ends in ".pdf" (case-insensitive) are rendered page by
	    page with ``pdf_to_images``; anything else is opened as an image.

	    Args:
	        file_path: Path of the file to process.

	    Returns:
	        A ``(full_text, fields)`` tuple: the recognised text and the dict
	        produced by ``extract_fields``. On any error the tuple is
	        ``("Error processing file: <message>", {})`` instead of raising.
	    """
	    try:
	        if file_path.lower().endswith(".pdf"):
	            # One "\n"-terminated chunk per page, joined once at the end.
	            full_text = "".join(
	                "\n".join(reader.readtext(np.array(page), detail=0)) + "\n"
	                for page in pdf_to_images(file_path)
	            )
	        else:
	            # The context manager releases the file handle as soon as the
	            # pixels have been copied into the array.
	            with PILImage.open(file_path) as source:
	                arr = np.array(source.convert("RGB"))
	            full_text = "\n".join(reader.readtext(arr, detail=0))

	        return full_text, extract_fields(full_text)

	    except Exception as e:
	        # Top-level boundary for both the API and the UI: report, don't raise.
	        return f"Error processing file: {str(e)}", {}


	# =====================================================
	# FASTAPI APP WITH CORS
	# =====================================================
	# FastAPI application object; the routes below are registered on it and the
	# Gradio UI is mounted onto it further down.
	app = FastAPI(title="Invoice OCR API", version="1.0")

	# CORS: every origin, method and header is accepted.
	# NOTE(review): wildcard origins are combined with allow_credentials=True.
	# The FastAPI/Starlette CORS documentation advises listing origins
	# explicitly when credentials are allowed - confirm whether credentials are
	# needed at all here (nothing visible in this file reads cookies or auth headers).
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["*"],
	allow_credentials=True,
	allow_methods=["*"],
	allow_headers=["*"],
	)


	@app.get("/")
	async def root():
	    """Return a small JSON index describing the available endpoints.

	    NOTE(review): the Gradio UI is also mounted at path "/" later in this
	    file; confirm which of the two is meant to answer GET "/".
	    """
	    return {
	        "message": "Invoice OCR API",
	        "endpoints": {
	            "POST /api/extract": "Extract data from invoice (PDF/Image)",
	            "GET /docs": "API Documentation",
	        },
	    }


	@app.post("/api/extract")
	async def extract_api(file: UploadFile = File(...)):
	    """Extract text and structured fields from an uploaded invoice.

	    Args:
	        file: Uploaded PDF, JPEG or PNG.

	    Returns:
	        JSON with "success", "filename", "text" and "fields". "success" is
	        False when ``run_ocr`` reported a failure (it then returns an empty
	        ``fields`` dict and an error message as text).

	    Raises:
	        HTTPException: 400 for an unsupported content type, 500 for any
	            unexpected error while handling the upload.
	    """
	    allowed_types = ["application/pdf", "image/jpeg", "image/png", "image/jpg"]
	    if file.content_type not in allowed_types:
	        raise HTTPException(
	            status_code=400,
	            detail="Invalid file type. Allowed: PDF, JPEG, PNG"
	        )

	    try:
	        # run_ocr() picks the PDF path from the file extension, so a PDF must
	        # get a ".pdf" suffix even if the client sent no or another filename.
	        if file.content_type == "application/pdf":
	            suffix = ".pdf"
	        else:
	            suffix = os.path.splitext(file.filename or "")[1]

	        tmp_path = None
	        try:
	            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
	                tmp_path = tmp.name
	                tmp.write(await file.read())

	            full_text, fields = run_ocr(tmp_path)
	        finally:
	            # Remove the temp file on every path, including failures.
	            if tmp_path is not None and os.path.exists(tmp_path):
	                os.unlink(tmp_path)

	        return JSONResponse({
	            # run_ocr() signals failure with an empty dict; a successful run
	            # always returns the full set of field keys.
	            "success": bool(fields),
	            "filename": file.filename,
	            "text": full_text,
	            "fields": fields
	        })

	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e


	# =====================================================
	# GRADIO FRONTEND
	# =====================================================
	def process_invoice(file):
	    """Gradio callback: run OCR on the uploaded invoice.

	    Args:
	        file: Value delivered by the file input. The component in this file
	            is configured with ``type="filepath"``, which hands over a plain
	            ``str`` path; an object exposing ``.name`` is accepted as well.
	            ``None`` means nothing was uploaded.

	    Returns:
	        The ``(full_text, fields)`` tuple from ``run_ocr``, or
	        ``("No file uploaded", {})`` when *file* is None.
	    """
	    if file is None:
	        return "No file uploaded", {}

	    # A str has no .name attribute, so only dereference it for file objects.
	    file_path = file if isinstance(file, str) else file.name
	    return run_ocr(file_path)


	# Gradio UI: one file input wired to process_invoice(), whose two return
	# values fill the text box and the JSON viewer, in that order.
	demo = gr.Interface(
	fn=process_invoice,
	# type="filepath": the callback receives the upload as a path.
	inputs=gr.File(type="filepath", label="Upload Invoice (PDF/Image)"),
	outputs=[
	gr.Textbox(label="Extracted Text", lines=10),
	gr.JSON(label="Extracted Fields")
	],
	title="📄 Invoice OCR Extractor",
	description="Upload PDF or Image invoices to extract text and structured data using EasyOCR",
	# No example files are shipped, so there is nothing to cache.
	examples=None,
	cache_examples=False
	)


	# =====================================================
	# MOUNT GRADIO ON FASTAPI
	# =====================================================
	# Attach the Gradio UI to the FastAPI app and rebind `app` to the result,
	# which is what gets served in the __main__ block.
	# NOTE(review): a GET "/" route is already registered on `app` above;
	# confirm that the UI is still reachable at "/" with this mount path.
	app = gr.mount_gradio_app(app, demo, path="/")


	# =====================================================
	# LAUNCH (Hugging Face compatible)
	# =====================================================
	if __name__ == "__main__":
	    # Imported lazily: only needed when the file is run as a script.
	    import uvicorn

	    # Listen on all interfaces, port 7860.
	    uvicorn.run(app, host="0.0.0.0", port=7860)