Spaces:

jhonparra18
/

ocr-LLM-image-summarizer

Runtime error

App Files Files Community

ocr-LLM-image-summarizer / image_processor.py

jhonparra18

app behaviour

08f01a2 over 2 years ago

raw

history blame contribute delete

4.16 kB

	import cv2
	import pytesseract
	from config import PYTESSERACT_DEFAULT_CONFIG
	from pathlib import Path
	from tqdm import tqdm
	import numpy as np
	from langchain.tools import BaseTool
	from typing import Optional, Type
	from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
	from PIL import Image

	class ImageProcessor(BaseTool):
	    """LangChain tool that preprocesses a receipt/invoice image and OCRs it.

	    The tool entry point is :meth:`_run`: it loads the image as grayscale,
	    applies a linear contrast transform to fade watermarks, and feeds the
	    result to Tesseract. The remaining methods (``deskew``, ``dilate_erode``,
	    ``detect_angle``, ``opening``) are optional preprocessing helpers that the
	    default pipeline does not call.
	    """

	    # Annotated so the overrides are accepted as fields by the base class
	    # (LangChain's custom-tool docs declare them as ``name: str = ...``).
	    name: str = "ImageProcessor"
	    description: str = "useful when you need to extract info from an image in an img_path corresponding to a receipt or invoice and tries to preprocess it returning all the text in the image using an OCR system."

	    def binarize(self, img_path):
	        """Load the image at ``img_path`` and return it as grayscale.

	        Despite the name, no thresholding is applied: Otsu binarization was
	        tried and dropped because it hurt OCR quality on the invoice/receipt
	        test dataset.

	        :param img_path: path (``str`` or path-like) of the image to load
	        :return: a single-channel image in format of (h, w)
	        :raises FileNotFoundError: if OpenCV cannot read the image
	        """
	        img = cv2.imread(str(img_path))
	        if img is None:
	            # cv2.imread signals failure by returning None rather than
	            # raising; fail here with a message that names the path.
	            raise FileNotFoundError(f"could not read image at {img_path!r}")
	        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	    def remove_watermark(self, img, alpha=1.8, beta=-180):
	        """Fade watermarks with a linear intensity transform.

	        Computes ``alpha * img + beta``, clips the result to [0, 255] and
	        casts it to ``uint8``.

	        :param img: cv2 image | np.array
	        :param alpha: multiplicative gain applied to every pixel
	        :param beta: offset added after the gain
	        :return: the transformed image as a ``uint8`` array
	        """
	        scaled = alpha * img + beta
	        return np.clip(scaled, 0, 255).astype(np.uint8)

	    def deskew(self, img):
	        """Rotate ``img`` about its centre to straighten its content.

	        The skew angle is taken from the minimum-area rectangle enclosing all
	        pixels greater than zero.

	        :param img: cv2 image | np.array
	        :return: the rotated image, same size as the input; the input itself
	            when it has no pixel greater than zero
	        """
	        coords = np.column_stack(np.where(img > 0))
	        if coords.size == 0:
	            # Nothing to fit a rectangle to; minAreaRect would raise.
	            return img
	        angle = cv2.minAreaRect(coords)[-1]
	        # NOTE(review): this normalisation assumes minAreaRect reports angles
	        # in [-90, 0); newer OpenCV releases may use a different range --
	        # confirm against the installed version.
	        if angle < -45:
	            angle = -(90 + angle)
	        else:
	            angle = -angle
	        (h, w) = img.shape[:2]
	        center = (w // 2, h // 2)
	        M = cv2.getRotationMatrix2D(center, angle, 1.0)
	        return cv2.warpAffine(
	            img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
	        )

	    def dilate_erode(self, img):
	        """Smooth the image with a blur / dilate / erode / filter sequence.

	        :param img: cv2 image | np.array
	        :return: the filtered image
	        """
	        kernel = np.ones((2, 1), np.uint8)
	        kernel2 = np.ones((1, 1), np.uint8)
	        img = cv2.blur(img, (6, 5))
	        img = cv2.dilate(img, kernel, iterations=3)
	        # The original passed the bare tuple (2, 1) here; an explicit 2x1
	        # structuring element is presumably what was intended.
	        img = cv2.erode(img, kernel, iterations=1)
	        img = cv2.blur(img, (1, 1))
	        img = cv2.bilateralFilter(img, 10, 35, 30)
	        img = cv2.dilate(img, kernel2, iterations=1)
	        return img

	    def detect_angle(self, img_path):
	        """Load an image, apply its EXIF orientation, and return it as BGR.

	        Only EXIF Orientation (tag 274) values 3, 6 and 8 trigger a rotation;
	        any other value, or a missing tag, leaves the image as stored.

	        :param img_path: path of the image to open with PIL
	        :return: np.array of shape (h, w, 3) in BGR channel order
	        """
	        ##taken from https://stackoverflow.com/questions/13872331/rotating-an-image-with-orientation-specified-in-exif-using-python-without-pil-in
	        exif_orientation_tag = 274
	        rotations = {
	            3: Image.ROTATE_180,
	            6: Image.ROTATE_270,
	            8: Image.ROTATE_90,
	        }
	        with Image.open(img_path) as pil_img:
	            # .get() tolerates EXIF blocks that lack the Orientation tag.
	            orientation = pil_img.getexif().get(exif_orientation_tag)
	            oriented = pil_img
	            if orientation in rotations:
	                oriented = pil_img.transpose(rotations[orientation])
	            # Force three channels so the channel flip below is valid for
	            # grayscale, palette and RGBA inputs too.
	            rgb = np.array(oriented.convert("RGB"))
	        return rgb[:, :, ::-1]  # convert to BGR

	    def opening(self, image):
	        """Apply a morphological opening with a 5x5 kernel of ones.

	        :param image: cv2 image | np.array
	        :return: the opened image
	        """
	        kernel = np.ones((5, 5), np.uint8)
	        return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

	    def process_image(self, img_path):
	        """Run the default preprocessing: grayscale load, then watermark fade.

	        :param img_path: path of the image to preprocess
	        :return: the preprocessed single-channel ``uint8`` image
	        """
	        gray = self.binarize(img_path)
	        return self.remove_watermark(gray)

	    def img_to_text(self, img, lang="spa"):
	        """OCR ``img`` with Tesseract and return the recognised text.

	        :param img: image accepted by ``pytesseract.image_to_string``
	        :param lang: Tesseract language code
	        :return: the extracted text
	        """
	        return pytesseract.image_to_string(
	            img, lang=lang, config=PYTESSERACT_DEFAULT_CONFIG
	        )

	    def _run(self, img_path, save_to_disk=False):
	        """Preprocess the image at ``img_path`` and return its OCR text.

	        :param img_path: path of the image to read
	        :param save_to_disk: when true, also write the text to
	            ``/tmp/<image stem>.txt`` and the preprocessed image to
	            ``images/rotated-<image name>``
	        :return: the text extracted from the image
	        """
	        img = self.process_image(img_path)
	        text = self.img_to_text(img)
	        if save_to_disk:
	            src = Path(img_path)
	            # with_suffix handles any image extension, not just ".jpg".
	            with open(f"/tmp/{src.with_suffix('.txt').name}", "w", encoding="utf-8") as f:
	                f.write(text)
	            # cv2.imwrite does not create missing directories.
	            Path("images").mkdir(parents=True, exist_ok=True)
	            cv2.imwrite(f"images/rotated-{src.name}", img)
	        return text

	    # as used in langchain documentation https://python.langchain.com/docs/modules/agents/tools/custom_tools
	    async def _arun(
	        self,
	        img_path: str,
	        save_to_disk=False,
	        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
	    ) -> str:
	        """Use the tool asynchronously.

	        :raises NotImplementedError: always; only the synchronous path exists
	        """
	        raise NotImplementedError("does not support async")