HemanthR007 committed on
Commit
19c7eb6
·
verified ·
1 Parent(s): cde453e

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +41 -63
main.py CHANGED
@@ -1,21 +1,16 @@
1
- from fastapi import FastAPI, Request
2
  import base64
3
  from PIL import Image, ImageEnhance
4
  import pytesseract
5
  from langdetect import detect, DetectorFactory
6
- #from googletrans import Translator
7
  from deep_translator import GoogleTranslator
8
  import re
9
  import numpy as np
10
  import cv2
11
  import unicodedata
12
  import io
13
- import uvicorn
14
- from fastapi import UploadFile, File
15
  from pydantic import BaseModel
16
 
17
-
18
-
19
  # Fix language detection randomness
20
  DetectorFactory.seed = 0
21
 
@@ -93,38 +88,20 @@ def extract_field_from_lines(lines, patterns):
93
  def extract_invoice_fields(text):
94
  lines = [line.strip() for line in text.split('\n') if line.strip()]
95
  invoice_number_patterns = [
96
- # Tax Invoice with number explicitly mentioned
97
- r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/]{4,})',
98
-
99
- r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)(?!date)([A-Z0-9][A-Z0-9\-_/]{4,})',
100
-
101
-
102
-
103
- # Generic Invoice No. / Invoice #
104
- r'(?:invoice\s*(?:number|no|nos|na|#)?\s*[:\-\=\.]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{3,})',
105
-
106
- # Receipt patterns
107
- r'(?:receipt\s*(?:number|no|#)?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
108
-
109
- # Generic # prefix
110
- r'(?:^|\s)#\s*([A-Z0-9][A-Z0-9\-_/\.]{2,})',
111
-
112
- # Order after Receipt
113
- r'(?:order\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})'
114
- ]
115
-
116
-
117
 
118
- # Context-aware patterns first (with "date" keywords)
119
  date_patterns = [
120
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})',
121
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*([A-Za-z]{3,9}[ ]?\d{1,2},?[ ]?\d{4})',
122
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{4}[/-]\d{1,2}[/-]\d{1,2})',
123
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
124
- r'(?:receipt\s*date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
125
  ]
126
 
127
- # Fallback patterns (no keywords, match only if above fail)
128
  fallback_date_patterns = [
129
  r'\b(\d{1,2}\s[A-Za-z]{3,9}\s?\d{2,4})\b',
130
  r'\b(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})\b',
@@ -133,23 +110,15 @@ def extract_invoice_fields(text):
133
  r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
134
  ]
135
 
136
-
137
-
138
-
139
  amount_patterns = [
140
  r'(?:total\s*amount|grand\s*total|amount\s*payable|net\s*amount|total|rounding)\s*[:\-]?\s*\₹?\s*([\d,]+\.\d{2})',
141
- # r'Total\s+Sales\s*\(Inclusive\s+GST\)\s*[A-Za-z]*\s*([\d,.]+)'
142
- #r'[\s:](\d{3,6}\.\d{2})[\s]*$',
143
- #r'(?i)(?:total\s*(?:value|due)?|invoice\s*value)\s*[:\-]?\s*(?:₹|Rs\.?|INR)?\s*([\d,.]+)', # Added this pattern
144
- r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b', # Added this pattern
145
- #r'(?i)(total\s*(amount|value|due)?|invoice\s*value|grand\s*total)[:\-]?\s*(₹|Rs\.?|INR)?\s*([\d,.]+)',
146
- r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b'
147
-
148
  ]
149
 
150
  invoice_number = extract_field_from_lines(lines, invoice_number_patterns)
151
  invoice_date = extract_field_from_lines(lines, date_patterns) or extract_field_from_lines(lines, fallback_date_patterns)
152
  total_amount = extract_field_from_lines(lines, amount_patterns)
 
153
  if not total_amount:
154
  numbers = []
155
  for line in lines:
@@ -157,44 +126,53 @@ def extract_invoice_fields(text):
157
  numbers += [float(m.replace(',', '')) for m in matches if m]
158
  if numbers:
159
  total_amount = f"{max(numbers):.2f}"
 
160
  return {
161
  "invoice_number": invoice_number,
162
  "invoice_date": invoice_date,
163
  "total_amount": total_amount
164
  }
165
 
166
- # ------------------ API ENDPOINT ------------------
167
  class ImagePayload(BaseModel):
168
  image: str
169
 
 
 
 
 
170
  @app.post("/predict")
171
  async def predict(payload: ImagePayload):
172
- img_base64 = payload.image
173
- if not img_base64:
174
- return {"error": "No image provided"}
 
175
 
176
- if img_base64.startswith("data:image"):
177
- img_base64 = img_base64.split(",")[1]
178
- # Preprocess
179
- processed_img = preprocess_image(image)
180
 
181
- # OCR + Translation
182
- text_data = perform_ocr(processed_img)
 
183
 
184
- # Cleaning
185
- cleaned_text = clean_ocr_text(text_data["translated_text"] or text_data["original_text"])
186
 
187
- # Extraction
188
- fields = extract_invoice_fields(cleaned_text)
189
 
190
- return {
191
- "language": text_data["detected_language"],
192
- "text": cleaned_text,
193
- "fields": fields
194
- }
195
 
 
 
 
 
 
196
 
197
- if __name__ == "__main__":
198
- import os
199
- port = int(os.environ.get("PORT", 8080))
200
- #uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
1
+ from fastapi import FastAPI
2
  import base64
3
  from PIL import Image, ImageEnhance
4
  import pytesseract
5
  from langdetect import detect, DetectorFactory
 
6
  from deep_translator import GoogleTranslator
7
  import re
8
  import numpy as np
9
  import cv2
10
  import unicodedata
11
  import io
 
 
12
  from pydantic import BaseModel
13
 
 
 
14
  # Fix language detection randomness
15
  DetectorFactory.seed = 0
16
 
 
88
  def extract_invoice_fields(text):
89
  lines = [line.strip() for line in text.split('\n') if line.strip()]
90
  invoice_number_patterns = [
91
+ r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/]{4,})',
92
+ r'(?:invoice\s*(?:number|no|nos|na|#)?\s*[:\-\=\.]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{3,})',
93
+ r'(?:receipt\s*(?:number|no|#)?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
94
+ r'(?:^|\s)#\s*([A-Z0-9][A-Z0-9\-_/\.]{2,})',
95
+ r'(?:order\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})'
96
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
 
98
  date_patterns = [
99
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})',
100
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*([A-Za-z]{3,9}[ ]?\d{1,2},?[ ]?\d{4})',
101
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{4}[/-]\d{1,2}[/-]\d{1,2})',
102
  r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
 
103
  ]
104
 
 
105
  fallback_date_patterns = [
106
  r'\b(\d{1,2}\s[A-Za-z]{3,9}\s?\d{2,4})\b',
107
  r'\b(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})\b',
 
110
  r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
111
  ]
112
 
 
 
 
113
  amount_patterns = [
114
  r'(?:total\s*amount|grand\s*total|amount\s*payable|net\s*amount|total|rounding)\s*[:\-]?\s*\₹?\s*([\d,]+\.\d{2})',
115
+ r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b',
 
 
 
 
 
 
116
  ]
117
 
118
  invoice_number = extract_field_from_lines(lines, invoice_number_patterns)
119
  invoice_date = extract_field_from_lines(lines, date_patterns) or extract_field_from_lines(lines, fallback_date_patterns)
120
  total_amount = extract_field_from_lines(lines, amount_patterns)
121
+
122
  if not total_amount:
123
  numbers = []
124
  for line in lines:
 
126
  numbers += [float(m.replace(',', '')) for m in matches if m]
127
  if numbers:
128
  total_amount = f"{max(numbers):.2f}"
129
+
130
  return {
131
  "invoice_number": invoice_number,
132
  "invoice_date": invoice_date,
133
  "total_amount": total_amount
134
  }
135
 
136
+ # ------------------ API ENDPOINTS ------------------
137
  class ImagePayload(BaseModel):
138
  image: str
139
 
140
+ @app.get("/")
141
+ def read_root():
142
+ return {"status": "ok", "message": "Invoice OCR API is running!"}
143
+
144
  @app.post("/predict")
145
  async def predict(payload: ImagePayload):
146
+ try:
147
+ img_base64 = payload.image
148
+ if not img_base64:
149
+ return {"error": "No image provided"}
150
 
151
+ # Remove base64 prefix if present
152
+ if img_base64.startswith("data:image"):
153
+ img_base64 = img_base64.split(",")[1]
 
154
 
155
+ # Decode base64 to image
156
+ image_bytes = base64.b64decode(img_base64)
157
+ image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
158
 
159
+ # Preprocess
160
+ processed_img = preprocess_image(image)
161
 
162
+ # OCR + Translation
163
+ text_data = perform_ocr(processed_img)
164
 
165
+ # Cleaning
166
+ cleaned_text = clean_ocr_text(text_data["translated_text"] or text_data["original_text"])
167
+
168
+ # Extraction
169
+ fields = extract_invoice_fields(cleaned_text)
170
 
171
+ return {
172
+ "language": text_data["detected_language"],
173
+ "text": cleaned_text,
174
+ "fields": fields
175
+ }
176
 
177
+ except Exception as e:
178
+ return {"error": str(e)}