Spaces:
Runtime error
Runtime error
Hemanth R committed on
Commit ·
89967fb
1
Parent(s): f5c94cf
Add model and API
Browse files
- app.py +184 -0
- requirements.txt +8 -0
app.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Request
|
| 2 |
+
import base64
|
| 3 |
+
from PIL import Image, ImageEnhance
|
| 4 |
+
import pytesseract
|
| 5 |
+
from langdetect import detect, DetectorFactory
|
| 6 |
+
from googletrans import Translator
|
| 7 |
+
import re
|
| 8 |
+
import numpy as np
|
| 9 |
+
import cv2
|
| 10 |
+
import unicodedata
|
| 11 |
+
import io
|
| 12 |
+
|
| 13 |
+
# langdetect is non-deterministic by default; pinning the seed removes the
# randomness so the same text always yields the same detected language.
DetectorFactory.seed = 0

# Application object that the route decorator further down registers on.
app = FastAPI()

# Two-letter language code -> three-letter code as used in the Tesseract
# `lang` string. Not referenced by the code visible in this file.
LANG_CODE_MAP = {
    "en": "eng",
    "ta": "tam",
    "hi": "hin",
    "kn": "kan",
    "ml": "mal",
    "te": "tel",
    "bn": "ben",
    "gu": "guj",
    "pa": "pan",
    "mr": "mar",
}
|
| 23 |
+
|
| 24 |
+
# ------------------ CLEANING ------------------
|
| 25 |
+
def clean_ocr_text(text):
    """Normalise raw OCR output into a single line of ASCII text.

    The steps run in this order:

    1. NFKC-normalise the text.
    2. Collapse every whitespace run (newlines included) to one space and
       trim the ends.
    3. Repair common letter/digit confusions (case-insensitively).
    4. Tidy spacing around ``.``, ``,``, ``:`` and ``#``.
    5. Replace each run of non-ASCII characters with a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    normalised = unicodedata.normalize("NFKC", text)
    result = re.sub(r'\s+', ' ', normalised).strip()

    # Applied in sequence; each one sees the output of the previous one.
    confusion_fixes = (
        (r'\bI(?=\d)', '1'),
        (r'(?<=\d)O\b', '0'),
        (r'\bO(?=\d)', '0'),
        (r'(?<=\d)l\b', '1'),
        (r'\bS(?=\d)', '5'),
        (r'\bBi\s*11\b', 'Bill'),
    )
    for regex, substitute in confusion_fixes:
        result = re.sub(regex, substitute, result, flags=re.IGNORECASE)

    # Pull punctuation back onto the preceding token.
    result = result.replace(" .", ".")
    result = result.replace(" ,", ",")
    result = re.sub(r'\s+:\s*', ': ', result)
    result = re.sub(r'\s+#\s*', ' #', result)

    # Anything outside the ASCII range becomes a space.
    return re.sub(r'[^\x00-\x7F]+', ' ', result)
|
| 43 |
+
|
| 44 |
+
# ------------------ PREPROCESSING ------------------
|
| 45 |
+
def preprocess_image(image):
    """Prepare an image for OCR: grayscale, denoise, sharpen, boost contrast.

    Args:
        image: A PIL image, a NumPy array, or any object ``np.array`` accepts.
            PIL images are converted to RGB first, so palette, 1-bit,
            grayscale and RGBA inputs are all handled. Multi-channel NumPy
            input is treated as BGR, as before. Two-dimensional arrays are
            taken to be grayscale already.

    Returns:
        A NumPy array holding the processed single-channel image.
    """
    if isinstance(image, Image.Image):
        # PIL stores channels in RGB order, so the BGR conversion code would
        # swap the red and blue weights. Converting to RGB also guarantees a
        # three-channel array whatever the source mode was.
        pixels = np.array(image.convert("RGB"))
        to_gray = cv2.COLOR_RGB2GRAY
    else:
        pixels = image if isinstance(image, np.ndarray) else np.array(image)
        to_gray = cv2.COLOR_BGR2GRAY

    if pixels.ndim == 2:
        # Already single-channel; cvtColor would reject it.
        gray = pixels
    else:
        gray = cv2.cvtColor(pixels, to_gray)

    gray = cv2.medianBlur(gray, 3)

    # 3x3 sharpening kernel: boosts the centre pixel against its 4 neighbours.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    gray = cv2.filter2D(gray, -1, kernel)

    # Double the contrast via PIL, then hand back an array.
    pil_img = Image.fromarray(gray)
    pil_img = ImageEnhance.Contrast(pil_img).enhance(2)
    return np.array(pil_img)
|
| 56 |
+
|
| 57 |
+
# ------------------ OCR ------------------
|
| 58 |
+
def perform_ocr(image):
    """Run Tesseract on *image*, detect the language, translate if needed.

    Args:
        image: The image passed straight to ``pytesseract.image_to_string``.

    Returns:
        A dict with:
          - ``detected_language``: language code from langdetect, or ``"en"``
            when no text was recognised or no language could be detected.
          - ``original_text``: the stripped OCR output.
          - ``translated_text``: the English translation, or ``None`` when
            the text is empty or already detected as English.
    """
    # Local import keeps this function self-contained; langdetect is already
    # a dependency of this module.
    from langdetect.lang_detect_exception import LangDetectException

    text = pytesseract.image_to_string(
        image,
        lang='eng+tam+kan+hin+tel+mal+ben+guj+pan+mar',
        config='--psm 6',
    ).strip()

    detected_lang = "en"
    if text:
        try:
            detected_lang = detect(text)
        except LangDetectException:
            # langdetect raises when the text offers no language features
            # (e.g. only digits and punctuation). Treat it like the empty
            # case instead of failing the whole request.
            detected_lang = "en"

    translated_text = None
    if detected_lang != 'en' and text:
        translator = Translator()
        translated_text = translator.translate(
            text, src=detected_lang, dest='en'
        ).text

    return {
        "detected_language": detected_lang,
        "original_text": text,
        "translated_text": translated_text,
    }
|
| 74 |
+
|
| 75 |
+
# ------------------ FIELD EXTRACTION ------------------
|
| 76 |
+
def extract_field_from_lines(lines, patterns):
    """Return the first regex hit found while scanning *lines* in order.

    Every pattern is tried against a line (case-insensitively) before moving
    on to the next line, so an earlier line always beats an earlier pattern.

    Args:
        lines: Iterable of strings to search.
        patterns: Iterable of regular-expression strings.

    Returns:
        The stripped text of capture group 1 when the matching pattern has
        capture groups, otherwise the stripped whole match. ``None`` when
        nothing matches.
    """
    for text_line in lines:
        for regex in patterns:
            hit = re.search(regex, text_line, flags=re.IGNORECASE)
            if hit is None:
                continue
            # lastindex is None when no capture group took part in the match.
            group_index = 1 if hit.lastindex else 0
            return hit.group(group_index).strip()
    return None
|
| 83 |
+
|
| 84 |
+
def extract_invoice_fields(text):
    """Pull the invoice number, date and total amount out of OCR text.

    The text is split on newlines and blank lines are dropped. Each field is
    then looked up with ``extract_field_from_lines`` using an ordered list of
    regular expressions. Every pattern has exactly one capturing group, the
    value itself, because the helper returns group 1.

    Args:
        text: The (cleaned) OCR text.

    Returns:
        A dict with keys ``invoice_number``, ``invoice_date`` and
        ``total_amount``. Each value is a string, or ``None`` when nothing
        was found. When no labelled or currency-prefixed amount exists,
        ``total_amount`` falls back to the largest ``N.NN`` figure in the
        text, formatted with two decimals.
    """
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # The (?!date) guard stops "Invoice Date: ..." / "Receipt Date: ..." /
    # "Order Date: ..." from yielding the word "Date" as the identifier.
    invoice_number_patterns = [
        # "Invoice No." / "Invoice Number" followed by a 5+ character id
        r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)(?!date)([A-Z0-9][A-Z0-9\-_/]{4,})',
        # Generic "Invoice No." / "Invoice #" with looser separators
        r'(?:invoice\s*(?:number|no|nos|na|#)?\s*[:\-\=\.]?\s*)(?!date)([A-Z0-9][A-Z0-9\-_/\.]{3,})',
        # Receipt patterns
        r'(?:receipt\s*(?:number|no|#)?\s*[:\-]?\s*)(?!date)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
        # Generic "#" prefix
        r'(?:^|\s)#\s*([A-Z0-9][A-Z0-9\-_/\.]{2,})',
        # "Order <id>"
        r'(?:order\s*)(?!date)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
    ]

    # Context-aware patterns first (with "date" keywords).
    date_patterns = [
        r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})',
        r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*([A-Za-z]{3,9}[ ]?\d{1,2},?[ ]?\d{4})',
        r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{4}[/-]\d{1,2}[/-]\d{1,2})',
        r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
    ]

    # Fallback patterns (no keywords, used only if the ones above fail).
    fallback_date_patterns = [
        r'\b(\d{1,2}\s[A-Za-z]{3,9}\s?\d{2,4})\b',
        r'\b(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})\b',
        r'\b([A-Za-z]{3,9}\s*\d{1,2},?\s*\d{4})\b',
        r'\b(\d{4}[/-]\d{1,2}[/-]\d{1,2})\b',
        r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
    ]

    amount_patterns = [
        # Labelled total, with an optional currency marker before the figure.
        r'(?:total\s*amount|grand\s*total|amount\s*payable|net\s*amount|total|rounding)'
        r'\s*[:\-]?\s*(?:₹|Rs\.?|INR)?\s*([\d,]+\.\d{2})',
        # Bare currency-prefixed figure. The currency marker is deliberately
        # non-capturing so that group 1 is the amount, not "Rs" / "INR".
        r'(?:₹|\b(?:Rs\.?|INR))\s*([\d,]+\.\d{2})\b',
    ]

    invoice_number = extract_field_from_lines(lines, invoice_number_patterns)
    invoice_date = (
        extract_field_from_lines(lines, date_patterns)
        or extract_field_from_lines(lines, fallback_date_patterns)
    )
    total_amount = extract_field_from_lines(lines, amount_patterns)

    if not total_amount:
        # Last resort: take the largest two-decimal figure anywhere in the
        # text. \d+ (rather than \d{1,3}) keeps ungrouped numbers such as
        # "1250.75" whole instead of truncating them to "250.75".
        figure = re.compile(r'\d+(?:,\d{3})*\.\d{2}')
        numbers = [
            float(found.replace(',', ''))
            for line in lines
            for found in figure.findall(line)
        ]
        if numbers:
            total_amount = f"{max(numbers):.2f}"

    return {
        "invoice_number": invoice_number,
        "invoice_date": invoice_date,
        "total_amount": total_amount,
    }
|
| 156 |
+
|
| 157 |
+
# ------------------ API ENDPOINT ------------------
|
| 158 |
+
@app.post("/predict")
async def predict(request: Request):
    """Extract invoice fields from a base64-encoded image.

    Expects a JSON object with an ``"image"`` key holding the base64 image
    data; a ``data:...;base64,`` URL prefix is tolerated.

    Returns:
        On success, a dict with ``language`` (detected language code),
        ``text`` (the cleaned OCR text) and ``fields`` (the extracted invoice
        fields). On bad input, a dict with a single ``error`` message.
    """
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        return {"error": "Request body is not valid JSON"}
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object"}

    img_base64 = data.get("image")
    if not img_base64:
        return {"error": "No image provided"}
    if not isinstance(img_base64, str):
        return {"error": "Image must be a base64-encoded string"}

    # Browsers commonly send data URLs. Valid base64 can never start with
    # "data:", so stripping the header cannot damage a plain payload.
    if img_base64.startswith("data:"):
        img_base64 = img_base64.partition(",")[2]

    try:
        image_data = base64.b64decode(img_base64)
        image = Image.open(io.BytesIO(image_data))
        # Image.open is lazy; force the decode here so truncated or corrupt
        # files are reported as bad input rather than failing later.
        image.load()
    except (ValueError, OSError):
        # binascii.Error is a ValueError; PIL's UnidentifiedImageError is an
        # OSError.
        return {"error": "Image could not be decoded"}

    # Preprocess
    processed_img = preprocess_image(image)

    # OCR + Translation
    text_data = perform_ocr(processed_img)

    # Cleaning: prefer the English translation when one was produced.
    # NOTE(review): clean_ocr_text collapses all whitespace, newlines
    # included, so extract_invoice_fields receives a single line here.
    # Confirm that is intended.
    cleaned_text = clean_ocr_text(
        text_data["translated_text"] or text_data["original_text"]
    )

    # Extraction
    fields = extract_invoice_fields(cleaned_text)

    return {
        "language": text_data["detected_language"],
        "text": cleaned_text,
        "fields": fields,
    }
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
pillow
|
| 4 |
+
pytesseract
|
| 5 |
+
googletrans==4.0.0-rc1
|
| 6 |
+
langdetect
|
| 7 |
+
opencv-python
|
| 8 |
+
numpy
|