HemanthR007 committed on
Commit
3e86386
·
verified ·
1 Parent(s): e4bcc25

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +177 -51
main.py CHANGED
@@ -1,110 +1,225 @@
1
- from fastapi import FastAPI
2
- import base64
3
- from PIL import Image, ImageEnhance
4
  import pytesseract
5
  from langdetect import detect, DetectorFactory
6
- from deep_translator import GoogleTranslator
7
  import re
 
 
8
  import numpy as np
9
  import cv2
 
10
  import unicodedata
11
- import io
12
- from pydantic import BaseModel
13
-
14
- pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
15
 
16
  # Fix language detection randomness
17
  DetectorFactory.seed = 0
18
 
19
- app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  LANG_CODE_MAP = {
22
- "en": "eng", "ta": "tam", "hi": "hin",
23
- "kn": "kan", "ml": "mal", "te": "tel",
24
- "bn": "ben", "gu": "guj", "pa": "pan", "mr": "mar"
 
 
 
25
  }
26
 
27
- # ------------------ CLEANING ------------------
 
28
  def clean_ocr_text(text):
 
29
  text = unicodedata.normalize("NFKC", text)
 
 
30
  text = re.sub(r'\s+', ' ', text).strip()
 
 
31
  replacements = {
32
- r'\bI(?=\d)': '1',
33
- r'(?<=\d)O\b': '0',
34
- r'\bO(?=\d)': '0',
35
- r'(?<=\d)l\b': '1',
36
- r'\bS(?=\d)': '5',
37
- r'\bBi\s*11\b': 'Bill',
38
  }
39
  for pattern, replacement in replacements.items():
40
  text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
 
 
41
  text = text.replace(" .", ".").replace(" ,", ",")
42
  text = re.sub(r'\s+:\s*', ': ', text)
43
  text = re.sub(r'\s+#\s*', ' #', text)
 
 
44
  text = re.sub(r'[^\x00-\x7F]+', ' ', text)
 
45
  return text
46
 
47
- # ------------------ PREPROCESSING ------------------
 
 
 
 
 
 
 
 
48
  def preprocess_image(image):
 
 
 
49
  if not isinstance(image, np.ndarray):
50
  image = np.array(image)
 
51
  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
 
 
52
  gray = cv2.medianBlur(gray, 3)
53
  kernel = np.array([[0, -1, 0], [-1, 5,-1], [0, -1, 0]])
54
  gray = cv2.filter2D(gray, -1, kernel)
 
 
55
  pil_img = Image.fromarray(gray)
56
  enhancer = ImageEnhance.Contrast(pil_img)
57
  pil_img = enhancer.enhance(2)
 
58
  return np.array(pil_img)
59
 
60
- # ------------------ OCR ------------------
61
- def perform_ocr(image):
62
- text = pytesseract.image_to_string(
63
- image,
64
- lang='eng+tam+kan+hin+tel+mal+ben+guj+pan+mar',
65
- config='--psm 6'
66
- ).strip()
67
- detected_lang = detect(text) if text else "en"
68
- translated_text = None
69
- if detected_lang != 'en' and text:
70
- try:
71
- translated_text = GoogleTranslator(source=detected_lang, target="en").translate(text)
72
- except Exception as e:
73
- translated_text = f"[Translation failed: {e}]"
74
 
75
- return {
76
- "detected_language": detected_lang,
77
- "original_text": text,
78
- "translated_text": translated_text
79
- }
80
 
81
- # ------------------ FIELD EXTRACTION ------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def extract_field_from_lines(lines, patterns):
83
  for line in lines:
84
  for pattern in patterns:
85
  match = re.search(pattern, line, flags=re.IGNORECASE)
86
  if match:
87
- return match.group(1).strip() if match.lastindex else match.group(0).strip()
 
 
 
 
 
 
88
  return None
89
 
 
90
  def extract_invoice_fields(text):
91
  lines = [line.strip() for line in text.split('\n') if line.strip()]
 
 
92
  invoice_number_patterns = [
93
- r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/]{4,})',
94
- r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)(?!date)([A-Z0-9][A-Z0-9\-_/]{4,})',
95
- r'(?:invoice\s*(?:number|no|nos|na|#)?\s*[:\-\=\.]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{3,})',
96
- r'(?:receipt\s*(?:number|no|#)?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
97
- r'(?:^|\s)#\s*([A-Z0-9][A-Z0-9\-_/\.]{2,})',
98
- r'(?:order\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
99
- ]
 
 
 
100
 
101
  date_patterns = [
102
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})',
103
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*([A-Za-z]{3,9}[ ]?\d{1,2},?[ ]?\d{4})',
104
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{4}[/-]\d{1,2}[/-]\d{1,2})',
105
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
 
106
  ]
107
-
108
  fallback_date_patterns = [
109
  r'\b(\d{1,2}\s[A-Za-z]{3,9}\s?\d{2,4})\b',
110
  r'\b(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})\b',
@@ -113,15 +228,25 @@ def extract_invoice_fields(text):
113
  r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
114
  ]
115
 
 
 
 
116
  amount_patterns = [
117
  r'(?:total\s*amount|grand\s*total|amount\s*payable|net\s*amount|total|rounding)\s*[:\-]?\s*\₹?\s*([\d,]+\.\d{2})',
118
- r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b',
 
 
 
119
  ]
120
 
121
  invoice_number = extract_field_from_lines(lines, invoice_number_patterns)
122
- invoice_date = extract_field_from_lines(lines, date_patterns) or extract_field_from_lines(lines, fallback_date_patterns)
123
  total_amount = extract_field_from_lines(lines, amount_patterns)
124
 
 
 
 
 
125
  if not total_amount:
126
  numbers = []
127
  for line in lines:
@@ -136,6 +261,7 @@ def extract_invoice_fields(text):
136
  "total_amount": total_amount
137
  }
138
 
 
139
  # ------------------ API ENDPOINTS ------------------
140
  class ImagePayload(BaseModel):
141
  image: str
 
1
+ # For Google Colab
2
+ from google.colab import files
3
+ from PIL import Image
4
  import pytesseract
5
  from langdetect import detect, DetectorFactory
6
+ from googletrans import Translator
7
  import re
8
+ import io
9
+ from pprint import pprint
10
  import numpy as np
11
  import cv2
12
+ from PIL import Image, ImageEnhance, ImageFilter
13
  import unicodedata
 
 
 
 
14
 
15
  # Fix language detection randomness
16
  DetectorFactory.seed = 0
17
 
def upload_and_process_image():
    """Prompt for an invoice image upload in Google Colab and open it.

    Only the first uploaded file is used.

    Returns:
        The decoded ``PIL.Image.Image``, or ``None`` when nothing was
        uploaded or the upload could not be decoded as an image.
    """
    print(" Please upload an invoice image:")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return None

    image_name = next(iter(uploaded))
    try:
        # Decode from the bytes returned by the upload widget rather than
        # re-opening a file by name from the working directory, so the result
        # does not depend on where (or under which name) the copy was saved.
        image = Image.open(io.BytesIO(uploaded[image_name]))
        # Image.open is lazy; force decoding here so a corrupt file is
        # reported by this function instead of failing later in the pipeline.
        image.load()
        return image
    except Exception as e:
        print("Error loading image:", e)
        return None
31
+
def perform_ocr(image):
    """Run OCR on *image*, detect its language and translate it to English.

    A first Tesseract pass runs with the default language. When the detected
    language is not English and has an entry in ``LANG_CODE_MAP``, OCR is
    repeated with that Tesseract language for better accuracy.

    Args:
        image: Image accepted by ``pytesseract.image_to_string``.

    Returns:
        A dict with the keys ``detected_language`` (langdetect code),
        ``original_text`` (OCR output) and ``translated_text`` (English
        translation, or ``None`` for English text or when translation
        fails). Returns ``None`` if OCR or language detection fails.
    """
    try:
        # First OCR pass (default settings)
        text = pytesseract.image_to_string(image, config='--psm 6').strip()

        # langdetect raises on empty input, so treat a blank page as English.
        detected_lang = detect(text) if text else 'en'

        if detected_lang != 'en':
            # langdetect returns two-letter codes ("ta"), while Tesseract
            # expects its own codes ("tam"); keep the first-pass text when
            # the language has no known Tesseract equivalent.
            tesseract_lang = LANG_CODE_MAP.get(detected_lang)
            if tesseract_lang:
                text = pytesseract.image_to_string(
                    image,
                    lang=tesseract_lang,
                    config='--psm 6'
                ).strip()

        translated_text = None
        if detected_lang != 'en' and text:
            try:
                translated_text = Translator().translate(
                    text, src=detected_lang, dest='en'
                ).text
            except Exception as e:
                # A translation failure must not discard the OCR result.
                print("Translation error:", e)

        return {
            "detected_language": detected_lang,
            "original_text": text,
            "translated_text": translated_text
        }
    except Exception as e:
        print("OCR Error:", e)
        return None
74
+
75
 
# Two-letter language codes (as compared against langdetect results in this
# file) mapped to the three-letter codes used in Tesseract ``lang`` strings.
LANG_CODE_MAP = {
    "en": "eng", "ta": "tam",
    "hi": "hin", "kn": "kan",
    "ml": "mal", "te": "tel",
}
84
 
85
+
86
+
def clean_ocr_text(text):
    """Tidy raw OCR output into a single normalised ASCII line.

    Steps, in order: NFKC-normalise, collapse every whitespace run (newlines
    included) to one space, repair common letter/digit confusions, fix spacing
    around ``.``, ``,``, ``:`` and ``#``, and finally replace each run of
    non-ASCII characters with one space.

    Args:
        text: Raw OCR text.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement) pairs applied in this order, case-insensitively.
    ocr_fixups = (
        (r'\bI(?=\d)', '1'),         # I before a digit -> 1
        (r'(?<=\d)O\b', '0'),        # O after a digit -> 0
        (r'\bO(?=\d)', '0'),         # O before a digit -> 0
        (r'(?<=\d)l\b', '1'),        # l after a digit -> 1
        (r'\bS(?=\d)', '5'),         # S before a digit -> 5
        (r'\bBi\s*11\b', 'Bill'),    # "Bi 11" misread of "Bill"
    )

    cleaned = unicodedata.normalize("NFKC", text)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    for wrong, right in ocr_fixups:
        cleaned = re.sub(wrong, right, cleaned, flags=re.IGNORECASE)

    # Punctuation spacing
    cleaned = cleaned.replace(" .", ".").replace(" ,", ",")
    cleaned = re.sub(r'\s+:\s*', ': ', cleaned)
    cleaned = re.sub(r'\s+#\s*', ' #', cleaned)

    # Anything outside ASCII is treated as OCR garbage.
    return re.sub(r'[^\x00-\x7F]+', ' ', cleaned)
115
 
116
+
def preprocess_image(image):
    """Convert an image to a denoised, sharpened, high-contrast grayscale array.

    Args:
        image: A NumPy array (colour arrays are treated as BGR, the OpenCV
            convention), a PIL image, or ``None``.

    Returns:
        The processed grayscale image as a NumPy array, or ``None`` when
        *image* is ``None``.
    """
    if image is None:
        print("Error: Input image is None.")
        return None

    color_to_gray = cv2.COLOR_BGR2GRAY
    if not isinstance(image, np.ndarray):
        if isinstance(image, Image.Image):
            # PIL stores channels as RGB, so the BGR conversion would swap
            # the red and blue weights; normalising the mode also covers
            # palette, grayscale and alpha images.
            image = image.convert("RGB")
            color_to_gray = cv2.COLOR_RGB2GRAY
        image = np.array(image)

    # A 2-D array is already single-channel; cvtColor would reject it.
    gray = image if image.ndim == 2 else cv2.cvtColor(image, color_to_gray)

    # Denoise + sharpen
    gray = cv2.medianBlur(gray, 3)
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    gray = cv2.filter2D(gray, -1, kernel)

    # Increase contrast
    pil_img = Image.fromarray(gray)
    pil_img = ImageEnhance.Contrast(pil_img).enhance(2)

    return np.array(pil_img)
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
 
 
 
 
 
147
 
def detect_language(text_data):
    """Detect and report the language of ``text_data['original_text']``.

    Args:
        text_data: Mapping holding the OCR text under ``'original_text'``.

    Returns:
        The language code returned by ``detect``, or ``None`` if detection
        (or reading the text) raised an exception.
    """
    # Friendly names used only for the printed message; unknown codes are
    # printed as-is.
    display_names = {
        'en': 'English',
        'hi': 'Hindi',
        'ta': 'Tamil',
        'te': 'Telugu',
        'kn': 'Kannada'
    }
    try:
        code = detect(text_data['original_text'])
        label = display_names.get(code, code)
        print(f"\nDetected Language: {label} ({code})")
        return code
    except Exception as err:
        print(f"Language detection error: {err}")
        return None
167
+
168
+
def translate_text(text_data, src_lang):
    """Return ``text_data['original_text']`` translated to English.

    Args:
        text_data: Mapping holding the OCR text under ``'original_text'``.
        src_lang: Language code of the text; ``'en'`` skips translation.

    Returns:
        The translated text, or the original text when it is already English
        or the translation attempt raised an exception.
    """
    if src_lang != 'en':
        try:
            result = Translator().translate(
                text_data['original_text'], src=src_lang, dest='en'
            )
            print("\nTranslation to English completed.")
            return result.text
        except Exception as err:
            # Best effort: fall back to the untranslated text.
            print(f"Translation error: {err}")
            return text_data['original_text']

    print("\nText is already in English, no translation needed.")
    return text_data['original_text']
183
+
184
+
def extract_field_from_lines(lines, patterns):
    """Return the first field value matched by any pattern on any line.

    Lines are scanned in order and, for each line, the patterns are tried in
    order (case-insensitively); the first match wins.

    Args:
        lines: Iterable of text lines.
        patterns: Iterable of regular expressions. The value is taken from
            the last capture group that participated in the match, so a
            pattern may capture a prefix (e.g. a currency symbol) before the
            value. Patterns without capture groups yield the whole match.

    Returns:
        The stripped matched value, or ``None`` when nothing matches.
    """
    for line in lines:
        for pattern in patterns:
            match = re.search(pattern, line, flags=re.IGNORECASE)
            if match:
                # lastindex is None when no group took part in the match;
                # group(0) is then the entire match.
                return match.group(match.lastindex or 0).strip()
    return None
198
 
199
+
200
  def extract_invoice_fields(text):
201
  lines = [line.strip() for line in text.split('\n') if line.strip()]
202
+
203
+
204
  invoice_number_patterns = [
205
+ r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/]{4,})',
206
+
207
+ r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)(?!date)([A-Z0-9][A-Z0-9\-_/]{4,})',
208
+ r'(?:invoice\s*(?:number|no|nos|na|#)?\s*[:\-\=\.]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{3,})',
209
+
210
+ r'(?:receipt\s*(?:number|no|#)?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
211
+ r'(?:^|\s)#\s*([A-Z0-9][A-Z0-9\-_/\.]{2,})',
212
+ r'(?:order\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})'
213
+ ]
214
+
215
 
216
  date_patterns = [
217
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})',
218
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*([A-Za-z]{3,9}[ ]?\d{1,2},?[ ]?\d{4})',
219
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{4}[/-]\d{1,2}[/-]\d{1,2})',
220
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
221
+ r'(?:receipt\s*date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
222
  ]
 
223
  fallback_date_patterns = [
224
  r'\b(\d{1,2}\s[A-Za-z]{3,9}\s?\d{2,4})\b',
225
  r'\b(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})\b',
 
228
  r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
229
  ]
230
 
231
+
232
+
233
+
234
  amount_patterns = [
235
  r'(?:total\s*amount|grand\s*total|amount\s*payable|net\s*amount|total|rounding)\s*[:\-]?\s*\₹?\s*([\d,]+\.\d{2})',
236
+ #r'(?i)(?:total\s*(?:value|due)?|invoice\s*value)\s*[:\-]?\s*(?:₹|Rs\.?|INR)?\s*([\d,.]+)', # Added this pattern
237
+ r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b',
238
+ r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b'
239
+
240
  ]
241
 
242
  invoice_number = extract_field_from_lines(lines, invoice_number_patterns)
243
+ invoice_date = extract_field_from_lines(lines, date_patterns)
244
  total_amount = extract_field_from_lines(lines, amount_patterns)
245
 
246
+ if not invoice_date:
247
+ invoice_date = extract_field_from_lines(lines, fallback_date_patterns)
248
+
249
+ # Fallback: largest number in OCR
250
  if not total_amount:
251
  numbers = []
252
  for line in lines:
 
261
  "total_amount": total_amount
262
  }
263
 
264
+
265
  # ------------------ API ENDPOINTS ------------------
266
  class ImagePayload(BaseModel):
267
  image: str