# ocr-LLM-image-summarizer / image_processor.py
# Page header residue: "jhonparra18's picture" (contributor avatar caption)
# Commit 08f01a2 - "app behaviour"
import cv2
import pytesseract
from config import PYTESSERACT_DEFAULT_CONFIG
from pathlib import Path
from tqdm import tqdm
import numpy as np
from langchain.tools import BaseTool
from typing import Optional, Type
from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
from PIL import Image
class ImageProcessor(BaseTool):
    """LangChain tool that preprocesses an image and extracts its text via OCR.

    The pipeline used by ``_run`` is: load the image as grayscale
    (``binarize``), apply a linear contrast/brightness transform
    (``remove_watermark``) and pass the result to Tesseract
    (``img_to_text``). The remaining methods (``deskew``, ``dilate_erode``,
    ``detect_angle``, ``opening``) are standalone helpers that the default
    pipeline does not call.
    """

    # Annotated so the overrides are accepted by both pydantic v1- and
    # v2-based versions of LangChain's BaseTool.
    name: str = "ImageProcessor"
    description: str = "useful when you need to extract info from an image in an img_path corresponding to a receipt or invoice and tries to preprocess it returning all the text in the image using an OCR system."

    def binarize(self, img_path):
        """Load the image at ``img_path`` and return it as grayscale.

        Despite the name, no thresholding is applied: Otsu thresholding was
        disabled by the original author because it may hurt OCR performance
        on invoices/receipts, as seen in their test dataset.

        :param img_path: path of the image file (``str`` or ``Path``)
        :return: grayscale image as an array of shape (h, w)
        :raises OSError: if OpenCV cannot read an image from ``img_path``
        """
        img = cv2.imread(str(img_path))
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with a clear message rather than inside cvtColor.
            raise OSError(f"could not read an image from {img_path!r}")
        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    def remove_watermark(self, img, alpha=1.8, beta=-180):
        """Attenuate light watermarks with a linear intensity transform.

        Computes ``alpha * img + beta`` and clips the result to [0, 255].
        With the defaults, input values up to 100 map to 0 and values
        above roughly 241 saturate to 255.

        :param img: cv2 image | np.array
        :param alpha: multiplicative gain applied to every pixel
        :param beta: additive offset applied after the gain
        :return: transformed image as a ``uint8`` array
        """
        new = alpha * img + beta
        return np.clip(new, 0, 255).astype(np.uint8)

    def deskew(self, img):
        """Rotate ``img`` by the angle of the min-area rectangle of its non-zero pixels.

        :param img: cv2 image | np.array; non-zero pixels are treated as content
        :return: rotated image with the same width and height as the input
        """
        coords = np.column_stack(np.where(img > 0))
        if coords.size == 0:
            # No non-zero pixels: there is nothing to fit a rectangle to.
            return img
        angle = cv2.minAreaRect(coords)[-1]
        # NOTE(review): this normalisation assumes minAreaRect reports angles
        # in [-90, 0); newer OpenCV releases reportedly use a different
        # range - confirm against the installed version.
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle
        (h, w) = img.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(
            img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
        )
        return rotated

    def dilate_erode(self, img):
        """Apply a fixed blur / dilate / erode / bilateral-filter sequence.

        :param img: cv2 image | np.array
        :return: the filtered image
        """
        kernel = np.ones((2, 1), np.uint8)
        kernel2 = np.ones((1, 1), np.uint8)
        img = cv2.blur(img, (6, 5))
        img = cv2.dilate(img, kernel, iterations=3)
        # NOTE(review): the kernel here is the tuple (2, 1), not an ndarray
        # like ``kernel`` above - confirm this is what was intended.
        img = cv2.erode(img, (2, 1), iterations=1)
        # NOTE(review): a 1x1 blur and a 1x1 dilation look like no-ops.
        img = cv2.blur(img, (1, 1))
        img = cv2.bilateralFilter(img, 10, 35, 30)
        img = cv2.dilate(img, kernel2, iterations=1)
        return img

    def detect_angle(self, img_path):
        """Load an image and undo the rotation recorded in its EXIF orientation tag.

        Only orientation values 3, 6 and 8 are handled; any other value, or
        a missing tag, leaves the image as stored.

        :param img_path: path of the image file
        :return: the (possibly rotated) image as a BGR array of shape (h, w, 3)
        """
        ##taken from https://stackoverflow.com/questions/13872331/rotating-an-image-with-orientation-specified-in-exif-using-python-without-pil-in
        pil_img = Image.open(img_path)
        # 274 is the EXIF Orientation tag; .get() avoids a KeyError when the
        # image carries EXIF data without that tag.
        orientation = pil_img.getexif().get(274)
        if orientation == 3:
            pil_img = pil_img.transpose(Image.ROTATE_180)
        elif orientation == 6:
            pil_img = pil_img.transpose(Image.ROTATE_270)
        elif orientation == 8:
            pil_img = pil_img.transpose(Image.ROTATE_90)
        # Force three channels so the channel flip below is valid for
        # grayscale, palette and RGBA inputs as well.
        rgb = np.array(pil_img.convert("RGB"))
        return rgb[:, :, ::-1]  # convert to BGR

    def opening(self, image):
        """Apply a morphological opening with a 5x5 all-ones kernel.

        :param image: cv2 image | np.array
        :return: the opened image
        """
        kernel = np.ones((5, 5), np.uint8)
        return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

    def process_image(self, img_path):
        """Run the preprocessing pipeline: grayscale load, then watermark removal.

        :param img_path: path of the image file
        :return: preprocessed grayscale image as a ``uint8`` array
        """
        img = self.binarize(img_path)
        img = self.remove_watermark(img)
        return img

    def img_to_text(self, img, lang="spa"):
        """OCR ``img`` with Tesseract using the project's default configuration.

        :param img: image accepted by ``pytesseract.image_to_string``
        :param lang: Tesseract language code (defaults to ``"spa"``)
        :return: the recognised text
        """
        text = pytesseract.image_to_string(
            img, lang=lang, config=PYTESSERACT_DEFAULT_CONFIG
        )
        return text

    def _run(self, img_path, save_to_disk=False):
        """Preprocess the image at ``img_path`` and return its OCR text.

        :param img_path: path of the image file (``str`` or ``Path``)
        :param save_to_disk: when true, also write the text to
            ``/tmp/<image stem>.txt`` and the preprocessed image to
            ``images/rotated-<image file name>``
        :return: the recognised text
        """
        img = self.process_image(img_path)
        text = self.img_to_text(img)
        if save_to_disk:
            src = Path(img_path)
            # with_suffix swaps whatever extension the image has, so the text
            # file never keeps an image extension (and cannot clobber the
            # source image when that already lives in /tmp).
            with open(f"/tmp/{src.with_suffix('.txt').name}", "w", encoding="utf-8") as f:
                f.write(text)
            # cv2.imwrite returns False instead of raising when the target
            # directory is missing, so make sure it exists first.
            Path("images").mkdir(parents=True, exist_ok=True)
            cv2.imwrite(f"images/rotated-{src.name}", img)
        return text

    # as used in langchain documentation https://python.langchain.com/docs/modules/agents/tools/custom_tools
    async def _arun(
        self,
        img_path: str,
        save_to_disk=False,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError("does not support async")