import cv2
import numpy as np
import PyPDF2
import fitz
from PIL import Image


def _hough_segments(binary_img, threshold, min_line_length, max_line_gap):
    """Run the probabilistic Hough transform and return segments as a list.

    Args:
        binary_img: Single-channel 8-bit image passed to ``cv2.HoughLinesP``.
        threshold: Accumulator threshold for the transform.
        min_line_length: Forwarded as ``minLineLength``.
        max_line_gap: Forwarded as ``maxLineGap``.

    Returns:
        A list of ``[x1, y1, x2, y2]`` lists of Python ints; empty when the
        transform finds nothing.
    """
    found = cv2.HoughLinesP(
        binary_img,
        1,
        np.pi / 180,
        threshold,
        minLineLength=min_line_length,
        maxLineGap=max_line_gap,
    )
    # HoughLinesP yields None (not an empty array) when no segment is found.
    if found is None:
        return []
    return found.reshape(-1, 4).tolist()


def rmv_dashedLines(clean_img1):
    """Paint detected long line segments green and return the cleaned green channel.

    Args:
        clean_img1: Image convertible with ``np.array`` to a 3-channel array
            (the original comment describes it as a numpy array without text).

    Returns:
        The green channel of the annotated copy after a 9x9 erosion followed
        by a 9x9 dilation. The input itself is not modified, since
        ``np.array`` copies it.
    """
    # NOTE(review): rmv_text returns an RGB image while the conversion below
    # uses COLOR_BGR2GRAY -- confirm the expected channel order.
    clean_img = np.array(clean_img1)
    erode_kernel = np.ones((3, 5), np.uint8)
    morph_kernel = np.ones((9, 9), np.uint8)

    gray = cv2.cvtColor(clean_img, cv2.COLOR_BGR2GRAY)
    # Inverted threshold: pixels darker than 200 become 255 (foreground).
    binary_inv = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)[1]

    # Foreground that survives the 3x5 erosion is grown back generously and
    # then masked OUT, so only the foreground the erosion wiped out remains.
    eroded = cv2.erode(binary_inv, erode_kernel, iterations=1)
    grown = cv2.dilate(eroded, morph_kernel, iterations=3)
    keep_mask = cv2.bitwise_not(cv2.bitwise_and(binary_inv, grown))
    thin_strokes = cv2.bitwise_and(binary_inv, binary_inv, mask=keep_mask)

    height, width = clean_img.shape[:2]
    green_bgr = (0, 255, 0)

    # Pass 1: minimum length is tied to the image dimensions (w - h);
    # every segment found is drawn.
    for x1, y1, x2, y2 in _hough_segments(thin_strokes, 200, width - height, 120):
        cv2.line(clean_img, (x1, y1), (x2, y2), green_bgr, 2)

    # Pass 2: shorter segments (a quarter of the width), drawn only when the
    # start point lies beyond the coordinate limit.
    # NOTE(review): 4050 is a hard-coded pixel coordinate, presumably tuned
    # for one specific page size -- confirm before using other resolutions.
    coord_limit = 4050
    for x1, y1, x2, y2 in _hough_segments(thin_strokes, 150, width // 4, 90):
        if x1 > coord_limit or y1 > coord_limit:
            cv2.line(clean_img, (x1, y1), (x2, y2), green_bgr, 2)

    # Drawn segments have value 255 in channel 1; take that channel and apply
    # a morphological opening (erode then dilate) with the 9x9 kernel.
    green = clean_img[:, :, 1]
    opened = cv2.dilate(
        cv2.erode(green, morph_kernel, iterations=1),
        morph_kernel,
        iterations=1,
    )
    return opened


####################################################################################################
def rmv_text(plan):
    """Redact the text on the first page of a PDF and render it as an image.

    The text is extracted with PyPDF2; every distinct character of that text
    is then searched on the page with PyMuPDF and each hit is redacted.

    Args:
        plan: Path of the PDF file to process.

    Returns:
        A PIL ``Image`` in mode ``'RGB'`` built from the rendered first page.
    """
    # Extract the text from the PDF in order to remove it. The context
    # manager closes the handle, which the original code left open.
    with open(plan, 'rb') as file:
        text = PyPDF2.PdfReader(file).pages[0].extract_text() or ""

    with fitz.open(plan) as pdf:
        page = pdf.load_page(0)

        # Searching each distinct character once gives the same hit
        # rectangles as searching every occurrence, without repeated work.
        for item in dict.fromkeys(text):
            for rect in page.search_for(item):
                page.add_redact_annot(rect)

        # Delete all text inside the redaction annotations.
        page.apply_redactions()
        # NOTE(review): the call above already consumes the annotations with
        # the default image handling, so this second call likely has nothing
        # left to apply -- confirm which of the two was intended.
        page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)

        pix = page.get_pixmap()
        clean_img = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)

    return clean_img