Hemanth R committed on
Commit
89967fb
·
1 Parent(s): f5c94cf

Add model and API

Browse files
Files changed (2) hide show
  1. app.py +184 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Request
2
+ import base64
3
+ from PIL import Image, ImageEnhance
4
+ import pytesseract
5
+ from langdetect import detect, DetectorFactory
6
+ from googletrans import Translator
7
+ import re
8
+ import numpy as np
9
+ import cv2
10
+ import unicodedata
11
+ import io
12
+
13
# langdetect is non-deterministic unless its factory is seeded; pin the seed
# so the same text always yields the same detected language.
DetectorFactory.seed = 0

# ASGI application object that the route decorators below attach to.
app = FastAPI()

# Two-letter language code -> three-letter code (the same three-letter codes
# appear in the Tesseract `lang` string used by perform_ocr).
LANG_CODE_MAP = {
    "en": "eng",
    "ta": "tam",
    "hi": "hin",
    "kn": "kan",
    "ml": "mal",
    "te": "tel",
    "bn": "ben",
    "gu": "guj",
    "pa": "pan",
    "mr": "mar",
}
23
+
24
+ # ------------------ CLEANING ------------------
25
# OCR look-alike corrections, applied in this order and case-insensitively.
_OCR_CONFUSION_FIXES = (
    (r'\bI(?=\d)', '1'),         # leading "I" before a digit is really "1"
    (r'(?<=\d)O\b', '0'),        # trailing "O" after a digit is really "0"
    (r'\bO(?=\d)', '0'),         # leading "O" before a digit is really "0"
    (r'(?<=\d)l\b', '1'),        # trailing "l" after a digit is really "1"
    (r'\bS(?=\d)', '5'),         # leading "S" before a digit is really "5"
    (r'\bBi\s*11\b', 'Bill'),    # "Bi11" / "Bi 11" is a misread "Bill"
)


def clean_ocr_text(text):
    """Normalise raw OCR output into tidy ASCII text, one entry per line.

    Each input line is cleaned independently and the line breaks are kept,
    because extract_invoice_fields splits the cleaned text on newlines to
    search it line by line. Collapsing newlines into spaces (as a single
    global ``\\s+`` substitution does) would hand that function one giant
    line.

    Steps applied to every line:
      1. Unicode NFKC normalisation (done once on the whole text).
      2. Runs of whitespace collapsed to a single space.
      3. Common OCR digit/letter confusions corrected.
      4. Stray spaces before ``.``, ``,``, ``:`` and around ``#`` removed.
      5. Non-ASCII characters replaced by a space.
      6. Spaces left behind by step 5 collapsed and trimmed.

    Args:
        text: Raw text as returned by the OCR (or translation) step.

    Returns:
        The cleaned text with non-empty lines joined by ``"\\n"``; an empty
        string when ``text`` is empty or ``None``.
    """
    if not text:
        return ""

    text = unicodedata.normalize("NFKC", text)

    cleaned_lines = []
    for raw_line in text.splitlines():
        line = re.sub(r'\s+', ' ', raw_line).strip()
        for pattern, replacement in _OCR_CONFUSION_FIXES:
            line = re.sub(pattern, replacement, line, flags=re.IGNORECASE)
        line = line.replace(" .", ".").replace(" ,", ",")
        line = re.sub(r'\s+:\s*', ': ', line)
        line = re.sub(r'\s+#\s*', ' #', line)
        line = re.sub(r'[^\x00-\x7F]+', ' ', line)
        # Stripping non-ASCII characters can leave double or trailing spaces.
        line = re.sub(r' {2,}', ' ', line).strip()
        if line:
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines)
43
+
44
+ # ------------------ PREPROCESSING ------------------
45
def preprocess_image(image):
    """Convert an image to a denoised, sharpened, high-contrast grayscale array.

    Args:
        image: A PIL image or a NumPy array. A 2-D array is treated as
            already grayscale; an array with a channel axis is treated as
            BGR (OpenCV's channel order), as in the original implementation.

    Returns:
        A 2-D NumPy array holding the processed grayscale image.
    """
    if isinstance(image, Image.Image):
        # PIL decodes to RGB (or palette / 1-bit / RGBA ...), not BGR. Let PIL
        # do the luminance conversion so the channel weights are correct and
        # every mode ends up as a single 8-bit channel.
        gray = np.array(image.convert("L"))
    else:
        pixels = np.asarray(image)
        if pixels.ndim == 2:
            # Already single-channel; cvtColor would reject this input.
            gray = pixels
        else:
            gray = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)

    # Remove salt-and-pepper noise before sharpening so it is not amplified.
    gray = cv2.medianBlur(gray, 3)

    # 3x3 sharpening kernel: boosts the centre pixel against its 4 neighbours.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    gray = cv2.filter2D(gray, -1, kernel)

    # Double the contrast via PIL, then hand back a NumPy array.
    pil_img = Image.fromarray(gray)
    pil_img = ImageEnhance.Contrast(pil_img).enhance(2)
    return np.array(pil_img)
56
+
57
+ # ------------------ OCR ------------------
58
def perform_ocr(image):
    """Run Tesseract on an image, detect the text language and translate it.

    Language detection and translation are best-effort: if either fails, the
    OCR text is still returned so the caller can continue with it.

    Args:
        image: Image accepted by ``pytesseract.image_to_string``.

    Returns:
        A dict with keys ``detected_language`` (language code, ``"en"`` when
        the text is empty or detection fails), ``original_text`` (stripped
        OCR output) and ``translated_text`` (English translation, or ``None``
        when the text is empty, already English, or translation failed).
    """
    import logging

    from langdetect.lang_detect_exception import LangDetectException

    text = pytesseract.image_to_string(
        image,
        lang='eng+tam+kan+hin+tel+mal+ben+guj+pan+mar',
        config='--psm 6',
    ).strip()

    detected_lang = "en"
    if text:
        try:
            detected_lang = detect(text)
        except LangDetectException:
            # Raised when the text has no usable language features, e.g. an
            # OCR result made only of digits and punctuation.
            detected_lang = "en"

    translated_text = None
    if text and detected_lang != 'en':
        try:
            translator = Translator()
            translated_text = translator.translate(
                text, src=detected_lang, dest='en'
            ).text
        except Exception:  # external service boundary: degrade, do not crash
            logging.getLogger(__name__).warning(
                "Translation from %r failed; using untranslated OCR text",
                detected_lang,
                exc_info=True,
            )
            translated_text = None

    return {
        "detected_language": detected_lang,
        "original_text": text,
        "translated_text": translated_text,
    }
74
+
75
+ # ------------------ FIELD EXTRACTION ------------------
76
def extract_field_from_lines(lines, patterns):
    """Return the first value found by any pattern in any line.

    Lines are scanned in order; for each line the patterns are tried in
    order, and the first hit wins. Matching is case-insensitive.

    Args:
        lines: Iterable of text lines to search.
        patterns: Iterable of regular-expression strings.

    Returns:
        The stripped text of the first capturing group that took part in the
        match (group 1 for ordinary single-group patterns), or the whole
        match when the pattern has no capturing groups. ``None`` when
        nothing matches.
    """
    for line in lines:
        for pattern in patterns:
            match = re.search(pattern, line, flags=re.IGNORECASE)
            if not match:
                continue
            if match.lastindex:
                # Group 1 can be None when it is optional and a later group
                # matched; pick the first group that actually participated
                # instead of calling .strip() on None.
                value = next(g for g in match.groups() if g is not None)
            else:
                value = match.group(0)
            return value.strip()
    return None
83
+
84
def extract_invoice_fields(text):
    """Pull the invoice number, invoice date and total amount out of OCR text.

    The text is split into non-empty lines and each field is looked up with
    extract_field_from_lines (first matching line wins, case-insensitive).

    Args:
        text: Cleaned OCR text, one document line per text line.

    Returns:
        A dict with keys ``invoice_number``, ``invoice_date`` and
        ``total_amount``. Each value is a string, or ``None`` when the field
        could not be found. When no labelled total is found, the largest
        ``d.dd`` number in the text is reported, formatted with two decimals.
    """
    lines = [line.strip() for line in (text or "").splitlines() if line.strip()]

    # The candidate token must contain at least one digit. Without this the
    # patterns happily capture the next label word ("Invoice Date" -> "Date",
    # "Invoice Number: 12" -> "Number") as the invoice number.
    digit_strict = r'(?=[A-Z0-9\-_/]*\d)'     # for tokens that exclude "."
    digit_loose = r'(?=[A-Z0-9\-_/.]*\d)'     # for tokens that allow "."

    invoice_number_patterns = [
        # "Invoice No: ABC-12345" style, longer identifiers; never "date...".
        r'invoice\s*(?:number|no)?\.?\s*[:\-]?\s*(?!date)'
        + digit_strict + r'([A-Z0-9][A-Z0-9\-_/]{4,})',
        # Generic "Invoice No." / "Invoice #", shorter identifiers.
        r'invoice\s*(?:number|no|nos|na|#)?\s*[:\-=.]?\s*'
        + digit_loose + r'([A-Z0-9][A-Z0-9\-_/.]{3,})',
        # Receipt numbers.
        r'receipt\s*(?:number|no|#)?\s*[:\-]?\s*'
        + digit_loose + r'([A-Z0-9][A-Z0-9\-_/.]{2,})',
        # Bare "#12345".
        r'(?:^|\s)#\s*' + digit_loose + r'([A-Z0-9][A-Z0-9\-_/.]{2,})',
        # "Order 12345".
        r'order\s*' + digit_loose + r'([A-Z0-9][A-Z0-9\-_/.]{2,})',
    ]

    # Dates introduced by a keyword are preferred over bare dates.
    date_label = r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*'
    date_patterns = [
        date_label + r'(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})',   # 12-May-2023
        date_label + r'([A-Za-z]{3,9}[ ]?\d{1,2},?[ ]?\d{4})',    # May 12, 2023
        date_label + r'(\d{4}[/-]\d{1,2}[/-]\d{1,2})',            # 2023-05-12
        date_label + r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',          # 12/05/2023
    ]

    # Used only when no keyword-introduced date was found anywhere.
    fallback_date_patterns = [
        r'\b(\d{1,2}\s[A-Za-z]{3,9}\s?\d{2,4})\b',
        r'\b(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})\b',
        r'\b([A-Za-z]{3,9}\s*\d{1,2},?\s*\d{4})\b',
        r'\b(\d{4}[/-]\d{1,2}[/-]\d{1,2})\b',
        r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
    ]

    amount_patterns = [
        # Labelled total, optionally followed by a currency marker.
        r'(?:total\s*amount|grand\s*total|amount\s*payable|net\s*amount|total|rounding)'
        r'\s*[:\-]?\s*(?:₹|Rs\.?|INR)?\s*([\d,]+\.\d{2})',
        # Any amount with a currency marker. The marker is non-capturing so
        # that group 1 is the amount, not the currency symbol.
        r'(?:₹|\bRs\.?|\bINR)\s*([\d,]+\.\d{2})\b',
    ]

    invoice_number = extract_field_from_lines(lines, invoice_number_patterns)
    invoice_date = (
        extract_field_from_lines(lines, date_patterns)
        or extract_field_from_lines(lines, fallback_date_patterns)
    )
    total_amount = extract_field_from_lines(lines, amount_patterns)

    if not total_amount:
        # Last resort: take the largest two-decimal number in the document.
        # ``\d+`` (rather than ``\d{1,3}``) keeps un-grouped numbers such as
        # "1234.56" whole instead of truncating them to "234.56".
        numbers = [
            float(found.replace(',', ''))
            for line in lines
            for found in re.findall(r'\d+(?:,\d{3})*\.\d{2}', line)
        ]
        if numbers:
            total_amount = f"{max(numbers):.2f}"

    return {
        "invoice_number": invoice_number,
        "invoice_date": invoice_date,
        "total_amount": total_amount,
    }
156
+
157
+ # ------------------ API ENDPOINT ------------------
158
@app.post("/predict")
async def predict(request: Request):
    """Run OCR on a base64-encoded image and extract invoice fields.

    Expects a JSON object body with an ``image`` key holding the base64
    image data; a ``data:<mime>;base64,`` prefix is accepted and ignored.

    Args:
        request: Incoming HTTP request.

    Returns:
        On success, a dict with ``language`` (detected language code),
        ``text`` (cleaned OCR text, translated to English when applicable)
        and ``fields`` (invoice number, date and total). On bad input, a
        dict with a single ``error`` message, matching the existing
        "No image provided" response style.
    """
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        return {"error": "Request body must be valid JSON"}

    img_base64 = data.get("image") if isinstance(data, dict) else None
    if not img_base64:
        return {"error": "No image provided"}
    if not isinstance(img_base64, str):
        return {"error": "Invalid image data"}

    # Browsers commonly send data URLs; keep only the payload after the comma.
    if img_base64.startswith("data:") and "," in img_base64:
        img_base64 = img_base64.split(",", 1)[1]

    try:
        image_data = base64.b64decode(img_base64)
        image = Image.open(io.BytesIO(image_data))
        # Image.open is lazy; force decoding here so corrupt data is
        # reported as a client error rather than failing mid-pipeline.
        image.load()
    except (ValueError, OSError):
        # binascii.Error is a ValueError; PIL's UnidentifiedImageError and
        # truncated-file errors are OSErrors.
        return {"error": "Invalid image data"}

    # Preprocess
    processed_img = preprocess_image(image)

    # OCR + Translation
    text_data = perform_ocr(processed_img)

    # Cleaning: prefer the English translation when one was produced.
    cleaned_text = clean_ocr_text(
        text_data["translated_text"] or text_data["original_text"]
    )

    # Extraction
    fields = extract_invoice_fields(cleaned_text)

    return {
        "language": text_data["detected_language"],
        "text": cleaned_text,
        "fields": fields,
    }
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pillow
4
+ pytesseract
5
+ googletrans==4.0.0-rc1
6
+ langdetect
7
+ opencv-python
8
+ numpy