Spaces:

Sabithulla
/

expenses_ai

Sleeping

Update ocr.py

c42c38e verified about 2 months ago

857 Bytes

	import cv2
	import numpy as np
	import re
	import pytesseract

	# Tell pytesseract which Tesseract executable to invoke; the path is
	# hard-coded to /usr/bin/tesseract rather than resolved from PATH.
	pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

	def preprocess(image_bytes):
	    """Decode an encoded image and binarize it for OCR.

	    The bytes are decoded as a colour image, converted to grayscale,
	    smoothed with a 5x5 Gaussian blur, and binarized with a Gaussian
	    adaptive threshold (block size 11, constant 2).

	    Args:
	        image_bytes: Encoded image data as a bytes-like object.

	    Returns:
	        The thresholded image returned by ``cv2.adaptiveThreshold``.

	    Raises:
	        ValueError: If ``image_bytes`` is empty or cannot be decoded as
	            an image.
	    """
	    buffer = np.frombuffer(image_bytes, np.uint8)
	    if buffer.size == 0:
	        raise ValueError("image_bytes is empty")

	    img = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
	    # imdecode reports undecodable data by returning None instead of
	    # raising; fail here with a clear message rather than letting
	    # cvtColor trip over a missing image.
	    if img is None:
	        raise ValueError("image_bytes could not be decoded as an image")

	    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	    blur = cv2.GaussianBlur(gray, (5, 5), 0)
	    return cv2.adaptiveThreshold(
	        blur,
	        255,
	        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	        cv2.THRESH_BINARY,
	        11,
	        2,
	    )

	def extract_total(text):
	    """Return the largest money-like amount found in *text*.

	    An amount is a run of digits followed by a dot and two digits
	    (e.g. ``12.50``). Amounts written with comma thousands separators
	    (e.g. ``1,234.56``) are read as one value instead of being cut down
	    to the digits after the last comma. The largest amount found is
	    taken to be the total.

	    Args:
	        text: Text to search, such as OCR output.

	    Returns:
	        The largest amount as a float, or ``0.0`` if none is found.
	    """
	    # The comma-grouped alternative is tried first so "1,234.56" is not
	    # matched as "234.56"; the plain form is the unchanged fallback.
	    amounts = re.findall(
	        r'(?<!\d)\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2}', text
	    )
	    return max((float(a.replace(",", "")) for a in amounts), default=0.0)

	def scan_receipt(image_bytes):
	    """Run OCR on a receipt image and report its text and total.

	    The image is passed through ``preprocess``, read by Tesseract with
	    the ``--psm 6`` option, and the recognized text is handed to
	    ``extract_total``.

	    Args:
	        image_bytes: Encoded receipt image as a bytes-like object.

	    Returns:
	        A dict with ``"raw_text"`` (the OCR output) and ``"total"``
	        (the value returned by ``extract_total`` for that text).
	    """
	    ocr_text = pytesseract.image_to_string(
	        preprocess(image_bytes),
	        config="--psm 6",
	    )
	    return {"raw_text": ocr_text, "total": extract_total(ocr_text)}