# PileCaps / PreprocessingFoundation.py
# Marthee's picture
# Create PreprocessingFoundation.py
# 23b88e9
import cv2
import numpy as np
import PyPDF2
import fitz
from PIL import Image
def rmv_dashedLines(clean_img1): #ip numpy array without text
    """Paint over long (dashed) line segments and return the cleaned green channel.

    The input is converted to a NumPy array, binarised (pixels darker than
    200 become foreground), and a mask of "thin" strokes is built by removing
    everything close to regions that survive a 3x5 erosion. Two probabilistic
    Hough passes run on that thin-stroke image; the detected segments are
    drawn in pure green on the colour image, so they read as 255 in the green
    channel. The green channel is then eroded and dilated with a 9x9 kernel
    and returned.

    Args:
        clean_img1: A 3-channel image (anything ``np.array`` accepts, e.g. a
            PIL image or an ndarray). Presumably the plan with text already
            removed, per the original comment -- verify against the caller.

    Returns:
        A 2-D ``uint8`` array: the green channel after line removal and a
        9x9 erode/dilate pass.
    """
    clean_img = np.array(clean_img1)
    kernel1 = np.ones((3, 5), np.uint8)
    kernel2 = np.ones((9, 9), np.uint8)

    imgGray = cv2.cvtColor(clean_img, cv2.COLOR_BGR2GRAY)
    imgBW = cv2.threshold(imgGray, 200, 255, cv2.THRESH_BINARY_INV)[1]

    # Keep only foreground that is thick enough to survive the 3x5 erosion,
    # grow it back generously, and invert: the result masks out everything
    # near thick shapes so only thin strokes remain in img4.
    img1 = cv2.erode(imgBW, kernel1, iterations=1)
    img2 = cv2.dilate(img1, kernel2, iterations=3)
    img3 = cv2.bitwise_and(imgBW, img2)
    img3 = cv2.bitwise_not(img3)
    img4 = cv2.bitwise_and(imgBW, imgBW, mask=img3)

    h, w, c = clean_img.shape

    #adjust length of center line based on h,w of img
    # NOTE(review): (w-h) is <= 0 for square/portrait images, which makes the
    # minimum length ineffective -- confirm this is intended.
    imgLines = cv2.HoughLinesP(img4, 1, np.pi/180, 200, minLineLength=(w-h), maxLineGap=120)
    # HoughLinesP returns None (not an empty array) when nothing is found.
    if imgLines is not None:
        for x1, y1, x2, y2 in imgLines.reshape(-1, 4).tolist():
            cv2.line(clean_img, (x1, y1), (x2, y2), (0, 255, 0), 2)

    #adjust length of center line based on h,w of img
    imgLines = cv2.HoughLinesP(img4, 1, np.pi/180, 150, minLineLength=w//4, maxLineGap=90)
    if imgLines is not None:
        for x1, y1, x2, y2 in imgLines.reshape(-1, 4).tolist():
            # NOTE(review): 4050 is a hard-coded pixel threshold; only
            # segments starting beyond it are painted in this second pass.
            if x1 > 4050 or y1 > 4050:
                cv2.line(clean_img, (x1, y1), (x2, y2), (0, 255, 0), 2)

    # Drawn segments are 255 in the green channel, i.e. indistinguishable
    # from a white background there.
    green = clean_img[:, :, 1]
    eroded = cv2.erode(green, kernel2, iterations=1)
    dilated = cv2.dilate(eroded, kernel2, iterations=1)
    return dilated
####################################################################################################
def rmv_text(plan):
    """Render the first page of a PDF as an image with its text redacted.

    The page text is extracted with PyPDF2; every distinct character of that
    text is then searched for on the page with PyMuPDF, each hit is marked
    with a redaction annotation, and the redactions are applied before the
    page is rasterised.

    Args:
        plan: Path of the PDF file; it is passed to both ``open`` and
            ``fitz.open``.

    Returns:
        A PIL ``Image`` in ``'RGB'`` mode built from the page pixmap.
    """
    #EXTRACT TEXT FROM PDF TO REMOVE IT
    # Close the file handle as soon as the text has been read.
    with open(plan, 'rb') as file:
        text = PyPDF2.PdfReader(file).pages[0].extract_text()

    pdf = fitz.open(plan)
    try:
        page = pdf.load_page(0)
        # Search each distinct character once (order preserved); repeating
        # the search for duplicate characters only re-adds the same rects.
        for item in dict.fromkeys(text or ""):
            for rect in page.search_for(item):
                page.add_redact_annot(rect)
        page.apply_redactions() #delete all text in redactions
        # NOTE(review): the original followed this with a second
        # apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE). By then no
        # redaction annotations were left, so that call had no effect and is
        # dropped. If images must be left untouched, pass that option to the
        # call above instead.
        pix = page.get_pixmap()
        # frombytes copies the samples, so the image outlives the document.
        clean_img = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
    finally:
        pdf.close()
    return clean_img