Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -1,21 +1,16 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
| 2 |
import base64
|
| 3 |
from PIL import Image, ImageEnhance
|
| 4 |
import pytesseract
|
| 5 |
from langdetect import detect, DetectorFactory
|
| 6 |
-
#from googletrans import Translator
|
| 7 |
from deep_translator import GoogleTranslator
|
| 8 |
import re
|
| 9 |
import numpy as np
|
| 10 |
import cv2
|
| 11 |
import unicodedata
|
| 12 |
import io
|
| 13 |
-
import uvicorn
|
| 14 |
-
from fastapi import UploadFile, File
|
| 15 |
from pydantic import BaseModel
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
# Fix language detection randomness
|
| 20 |
DetectorFactory.seed = 0
|
| 21 |
|
|
@@ -93,38 +88,20 @@ def extract_field_from_lines(lines, patterns):
|
|
| 93 |
def extract_invoice_fields(text):
|
| 94 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 95 |
invoice_number_patterns = [
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
# Generic Invoice No. / Invoice #
|
| 104 |
-
r'(?:invoice\s*(?:number|no|nos|na|#)?\s*[:\-\=\.]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{3,})',
|
| 105 |
-
|
| 106 |
-
# Receipt patterns
|
| 107 |
-
r'(?:receipt\s*(?:number|no|#)?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
|
| 108 |
-
|
| 109 |
-
# Generic # prefix
|
| 110 |
-
r'(?:^|\s)#\s*([A-Z0-9][A-Z0-9\-_/\.]{2,})',
|
| 111 |
-
|
| 112 |
-
# Order after Receipt
|
| 113 |
-
r'(?:order\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})'
|
| 114 |
-
]
|
| 115 |
-
|
| 116 |
-
|
| 117 |
|
| 118 |
-
# Context-aware patterns first (with "date" keywords)
|
| 119 |
date_patterns = [
|
| 120 |
r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})',
|
| 121 |
r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*([A-Za-z]{3,9}[ ]?\d{1,2},?[ ]?\d{4})',
|
| 122 |
r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{4}[/-]\d{1,2}[/-]\d{1,2})',
|
| 123 |
r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
|
| 124 |
-
r'(?:receipt\s*date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
|
| 125 |
]
|
| 126 |
|
| 127 |
-
# Fallback patterns (no keywords, match only if above fail)
|
| 128 |
fallback_date_patterns = [
|
| 129 |
r'\b(\d{1,2}\s[A-Za-z]{3,9}\s?\d{2,4})\b',
|
| 130 |
r'\b(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})\b',
|
|
@@ -133,23 +110,15 @@ def extract_invoice_fields(text):
|
|
| 133 |
r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
|
| 134 |
]
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
amount_patterns = [
|
| 140 |
r'(?:total\s*amount|grand\s*total|amount\s*payable|net\s*amount|total|rounding)\s*[:\-]?\s*\₹?\s*([\d,]+\.\d{2})',
|
| 141 |
-
|
| 142 |
-
#r'[\s:](\d{3,6}\.\d{2})[\s]*$',
|
| 143 |
-
#r'(?i)(?:total\s*(?:value|due)?|invoice\s*value)\s*[:\-]?\s*(?:₹|Rs\.?|INR)?\s*([\d,.]+)', # Added this pattern
|
| 144 |
-
r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b', # Added this pattern
|
| 145 |
-
#r'(?i)(total\s*(amount|value|due)?|invoice\s*value|grand\s*total)[:\-]?\s*(₹|Rs\.?|INR)?\s*([\d,.]+)',
|
| 146 |
-
r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b'
|
| 147 |
-
|
| 148 |
]
|
| 149 |
|
| 150 |
invoice_number = extract_field_from_lines(lines, invoice_number_patterns)
|
| 151 |
invoice_date = extract_field_from_lines(lines, date_patterns) or extract_field_from_lines(lines, fallback_date_patterns)
|
| 152 |
total_amount = extract_field_from_lines(lines, amount_patterns)
|
|
|
|
| 153 |
if not total_amount:
|
| 154 |
numbers = []
|
| 155 |
for line in lines:
|
|
@@ -157,44 +126,53 @@ def extract_invoice_fields(text):
|
|
| 157 |
numbers += [float(m.replace(',', '')) for m in matches if m]
|
| 158 |
if numbers:
|
| 159 |
total_amount = f"{max(numbers):.2f}"
|
|
|
|
| 160 |
return {
|
| 161 |
"invoice_number": invoice_number,
|
| 162 |
"invoice_date": invoice_date,
|
| 163 |
"total_amount": total_amount
|
| 164 |
}
|
| 165 |
|
| 166 |
-
# ------------------ API
|
| 167 |
class ImagePayload(BaseModel):
    """Request body model accepted by the ``/predict`` endpoint."""

    # Image content carried as a string.
    # NOTE(review): presumably base64-encoded image bytes -- confirm against the handler.
    image: str
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
@app.post("/predict")
|
| 171 |
async def predict(payload: ImagePayload):
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
|
|
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
processed_img = preprocess_image(image)
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
|
|
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
|
| 190 |
-
|
| 191 |
-
"
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
port = int(os.environ.get("PORT", 8080))
|
| 200 |
-
#uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
import base64
|
| 3 |
from PIL import Image, ImageEnhance
|
| 4 |
import pytesseract
|
| 5 |
from langdetect import detect, DetectorFactory
|
|
|
|
| 6 |
from deep_translator import GoogleTranslator
|
| 7 |
import re
|
| 8 |
import numpy as np
|
| 9 |
import cv2
|
| 10 |
import unicodedata
|
| 11 |
import io
|
|
|
|
|
|
|
| 12 |
from pydantic import BaseModel
|
| 13 |
|
|
|
|
|
|
|
| 14 |
# Fix language detection randomness
|
| 15 |
DetectorFactory.seed = 0
|
| 16 |
|
|
|
|
| 88 |
def extract_invoice_fields(text):
|
| 89 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 90 |
invoice_number_patterns = [
|
| 91 |
+
r'(?i)(?:invoice\s*(?:number|no)?\.?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/]{4,})',
|
| 92 |
+
r'(?:invoice\s*(?:number|no|nos|na|#)?\s*[:\-\=\.]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{3,})',
|
| 93 |
+
r'(?:receipt\s*(?:number|no|#)?\s*[:\-]?\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})',
|
| 94 |
+
r'(?:^|\s)#\s*([A-Z0-9][A-Z0-9\-_/\.]{2,})',
|
| 95 |
+
r'(?:order\s*)([A-Z0-9][A-Z0-9\-_/\.]{2,})'
|
| 96 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
|
|
|
| 98 |
date_patterns = [
|
| 99 |
r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})',
|
| 100 |
r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*([A-Za-z]{3,9}[ ]?\d{1,2},?[ ]?\d{4})',
|
| 101 |
r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{4}[/-]\d{1,2}[/-]\d{1,2})',
|
| 102 |
r'(?:invoice\s*date|bill\s*date|receipt\s*date|date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
|
|
|
|
| 103 |
]
|
| 104 |
|
|
|
|
| 105 |
fallback_date_patterns = [
|
| 106 |
r'\b(\d{1,2}\s[A-Za-z]{3,9}\s?\d{2,4})\b',
|
| 107 |
r'\b(\d{1,2}[/-][A-Za-z]{3,9}[/-]?\d{2,4})\b',
|
|
|
|
| 110 |
r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
|
| 111 |
]
|
| 112 |
|
|
|
|
|
|
|
|
|
|
| 113 |
amount_patterns = [
|
| 114 |
r'(?:total\s*amount|grand\s*total|amount\s*payable|net\s*amount|total|rounding)\s*[:\-]?\s*\₹?\s*([\d,]+\.\d{2})',
|
| 115 |
+
r'\b(₹|Rs\.?|INR)\s*([\d,]+\.\d{2})\b',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
]
|
| 117 |
|
| 118 |
invoice_number = extract_field_from_lines(lines, invoice_number_patterns)
|
| 119 |
invoice_date = extract_field_from_lines(lines, date_patterns) or extract_field_from_lines(lines, fallback_date_patterns)
|
| 120 |
total_amount = extract_field_from_lines(lines, amount_patterns)
|
| 121 |
+
|
| 122 |
if not total_amount:
|
| 123 |
numbers = []
|
| 124 |
for line in lines:
|
|
|
|
| 126 |
numbers += [float(m.replace(',', '')) for m in matches if m]
|
| 127 |
if numbers:
|
| 128 |
total_amount = f"{max(numbers):.2f}"
|
| 129 |
+
|
| 130 |
return {
|
| 131 |
"invoice_number": invoice_number,
|
| 132 |
"invoice_date": invoice_date,
|
| 133 |
"total_amount": total_amount
|
| 134 |
}
|
| 135 |
|
| 136 |
+
# ------------------ API ENDPOINTS ------------------
|
| 137 |
class ImagePayload(BaseModel):
    """Request body model accepted by the ``/predict`` endpoint."""

    # Base64-encoded image; the handler also accepts a leading
    # ``data:image...`` header, which it strips before decoding.
    image: str
|
| 139 |
|
| 140 |
+
@app.get("/")
def read_root():
    """Return a fixed status payload showing that the service is reachable."""
    health_report = {
        "status": "ok",
        "message": "Invoice OCR API is running!",
    }
    return health_report
|
| 143 |
+
|
| 144 |
@app.post("/predict")
async def predict(payload: ImagePayload):
    """Run the OCR pipeline on a base64-encoded image and extract invoice fields.

    Args:
        payload: Request body whose ``image`` is a base64 string, optionally
            prefixed with a ``data:image...`` URI header.

    Returns:
        On success, a dict with ``language``, ``text`` and ``fields``.
        On any failure, ``{"error": <message>}``; errors are reported in the
        response body rather than raised, which existing clients rely on.
    """
    try:
        # Strip surrounding whitespace so a blank payload is rejected up front
        # and a data-URI header is still recognised after leading spaces.
        img_base64 = (payload.image or "").strip()
        if not img_base64:
            return {"error": "No image provided"}

        # Remove the data-URI header if present. Partition at the FIRST comma
        # and verify something follows it, instead of indexing a split result
        # (which raised an opaque IndexError when the comma was missing).
        if img_base64.startswith("data:image"):
            _, separator, img_base64 = img_base64.partition(",")
            if not separator or not img_base64:
                return {"error": "Malformed data URI: no base64 payload after the header"}

        # Decode base64 to image. convert() returns an independent copy, so the
        # source image can be closed as soon as the conversion is done.
        image_bytes = base64.b64decode(img_base64)
        with Image.open(io.BytesIO(image_bytes)) as raw_image:
            image = raw_image.convert("RGB")

        # NOTE(review): the helpers below are called synchronously inside an
        # async handler; if they are slow they hold the event loop -- consider
        # a plain `def` handler or a thread pool. Confirm before changing.

        # Preprocess
        processed_img = preprocess_image(image)

        # OCR + Translation
        text_data = perform_ocr(processed_img)

        # Cleaning: prefer the translated text, fall back to the original.
        cleaned_text = clean_ocr_text(
            text_data["translated_text"] or text_data["original_text"]
        )

        # Extraction
        fields = extract_invoice_fields(cleaned_text)

        return {
            "language": text_data["detected_language"],
            "text": cleaned_text,
            "fields": fields,
        }

    except Exception as e:
        # Top-level request boundary: report the failure in the response body.
        return {"error": str(e)}
|
|
|
|
|
|