File size: 857 Bytes
dd24fc6
 
 
c42c38e
 
 
dd24fc6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import cv2
import numpy as np
import re
import pytesseract

# Tell pytesseract which Tesseract executable to invoke. The path is
# hard-coded, so importing this module overrides pytesseract's default lookup.
# NOTE(review): assumes the binary lives at /usr/bin/tesseract on every
# deployment target — confirm for non-Linux or containerized environments.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

def preprocess(image_bytes, *, blur_kernel=(5, 5), block_size=11, c=2):
    """Decode an encoded image and binarize it for OCR.

    The bytes are decoded as a 3-channel BGR image, converted to grayscale,
    smoothed with a Gaussian blur and then binarized with a Gaussian-weighted
    adaptive threshold. The defaults reproduce the values this function
    previously hard-coded.

    Args:
        image_bytes: Bytes-like object holding an encoded image (whatever
            formats the installed OpenCV build can decode).
        blur_kernel: ``(width, height)`` of the Gaussian kernel. OpenCV
            requires both values to be positive and odd.
        block_size: Side length of the pixel neighbourhood used to compute
            each local threshold. OpenCV requires an odd value greater
            than 1.
        c: Constant subtracted from the weighted neighbourhood mean before
            comparing against the pixel value.

    Returns:
        The thresholded image as returned by ``cv2.adaptiveThreshold``: a
        single-channel 8-bit array whose pixels are either 0 or 255.

    Raises:
        cv2.error: Propagated from OpenCV. In particular, when the bytes
            cannot be decoded ``cv2.imdecode`` produces no image and the
            following colour conversion fails.
    """
    # View the raw bytes as a 1-D uint8 buffer, the input imdecode expects.
    encoded = np.frombuffer(image_bytes, np.uint8)
    img = cv2.imdecode(encoded, cv2.IMREAD_COLOR)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Blur first so sensor noise and paper texture do not survive the
    # per-neighbourhood thresholding below. Sigma 0 lets OpenCV derive it
    # from the kernel size.
    blur = cv2.GaussianBlur(gray, blur_kernel, 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, block_size, c
    )

    return thresh

# A monetary amount: an integer part (plain digits, or comma-grouped
# thousands such as "1,234"), a decimal point and exactly two fractional
# digits. The look-arounds keep the pattern from matching a slice of a longer
# digit run, e.g. "12.34" inside "12.345".
_AMOUNT_RE = re.compile(r'(?<!\d)(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}(?!\d)')


def extract_total(text: str) -> float:
    """Return the largest two-decimal amount found in *text*.

    The total is taken to be the largest amount on the receipt; no attempt is
    made to locate a "total" label.

    Args:
        text: Text to scan, typically raw OCR output.

    Returns:
        The largest amount as a float, or ``0.0`` when *text* contains no
        amount. Comma thousands separators are accepted, so ``"1,234.56"``
        yields ``1234.56``.
    """
    # Strip grouping commas before conversion; float() rejects them.
    amounts = (float(m.replace(",", "")) for m in _AMOUNT_RE.findall(text))
    return max(amounts, default=0.0)

def scan_receipt(image_bytes):
    """Run OCR on a receipt image and pull out its total.

    The image is passed through ``preprocess``, the result is handed to
    Tesseract with page segmentation mode 6, and ``extract_total`` is applied
    to the recognized text.

    Args:
        image_bytes: Encoded image data, forwarded unchanged to
            ``preprocess``.

    Returns:
        A dict with two keys: ``"raw_text"`` (the string returned by
        Tesseract) and ``"total"`` (the value ``extract_total`` computed
        from that string).
    """
    ocr_text = pytesseract.image_to_string(
        preprocess(image_bytes),
        config='--psm 6',
    )
    return {"raw_text": ocr_text, "total": extract_total(ocr_text)}