Spaces:

Marthee
/

PileCaps

Sleeping

File size: 2,200 Bytes

23b88e9

import cv2
import numpy as np
import PyPDF2
import fitz
from PIL import Image



def rmv_dashedLines(clean_img1): #ip numpy array without text
    """Paint detected straight line segments green and return the opened green channel.

    Steps, in order:
      1. Binarise the image (inverse threshold at 200), so dark pixels become 255.
      2. Build a mask that keeps only the dark pixels lying outside the
         eroded-then-dilated (i.e. thicker) dark regions.
      3. Run two probabilistic Hough transforms on that mask and draw the
         resulting segments in (0, 255, 0) on a copy of the input.
      4. Take channel index 1 of the painted image and apply an erode followed
         by a dilate with a 9x9 kernel.

    Args:
        clean_img1: 3-channel image (anything ``np.array`` accepts). Per the
            original note it is expected to have had its text removed already.

    Returns:
        2-D ``numpy`` array: channel 1 of the painted image after the 9x9
        erode/dilate pass.
    """
    # Segments from the second Hough pass are only drawn when their start point
    # lies beyond this pixel coordinate on either axis.
    # NOTE(review): 4050 looks tuned to one specific drawing size -- confirm.
    far_edge_px = 4050
    green_bgr = (0, 255, 0)

    clean_img = np.array(clean_img1)  # work on a copy; the caller's image is untouched

    kernel1 = np.ones((3, 5), np.uint8)
    kernel2 = np.ones((9, 9), np.uint8)

    imgGray = cv2.cvtColor(clean_img, cv2.COLOR_BGR2GRAY)
    imgBW = cv2.threshold(imgGray, 200, 255, cv2.THRESH_BINARY_INV)[1]

    # Erode then dilate heavily to get the neighbourhood of thick strokes, then
    # keep only the dark pixels that fall OUTSIDE that neighbourhood.
    thick_core = cv2.erode(imgBW, kernel1, iterations=1)
    thick_area = cv2.dilate(thick_core, kernel2, iterations=3)
    keep_mask = cv2.bitwise_not(cv2.bitwise_and(imgBW, thick_area))
    thin_strokes = cv2.bitwise_and(imgBW, imgBW, mask=keep_mask)

    h, w = clean_img.shape[:2]

    # Pass 1: long segments; minimum length is derived from the image size (w - h).
    long_lines = cv2.HoughLinesP(thin_strokes, 1, np.pi / 180, 200,
                                 minLineLength=(w - h), maxLineGap=120)
    # HoughLinesP returns None (not an empty array) when nothing is found.
    if long_lines is not None:
        for x1, y1, x2, y2 in long_lines.reshape(-1, 4):
            cv2.line(clean_img, (int(x1), int(y1)), (int(x2), int(y2)), green_bgr, 2)

    # Pass 2: shorter segments (>= a quarter of the width), drawn only when
    # they start beyond far_edge_px.
    short_lines = cv2.HoughLinesP(thin_strokes, 1, np.pi / 180, 150,
                                  minLineLength=w // 4, maxLineGap=90)
    if short_lines is not None:
        for x1, y1, x2, y2 in short_lines.reshape(-1, 4):
            if x1 > far_edge_px or y1 > far_edge_px:
                cv2.line(clean_img, (int(x1), int(y1)), (int(x2), int(y2)), green_bgr, 2)

    # In channel 1 the painted segments have value 255.
    green = clean_img[:, :, 1]

    eroded = cv2.erode(green, kernel2, iterations=1)
    dilated = cv2.dilate(eroded, kernel2, iterations=1)

    return dilated

    
####################################################################################################
def rmv_text(plan):
    """Render page 0 of a PDF with its extractable text redacted.

    The text of the first page is extracted with PyPDF2; every distinct
    character in it is then searched for on the page with PyMuPDF, each hit is
    marked with a redaction annotation, and the redactions are applied before
    the page is rasterised.

    Args:
        plan: Path of the PDF file to process.

    Returns:
        ``PIL.Image.Image`` in RGB mode, the rendered first page after redaction.
    """
    # Extract the text that has to be removed; the context manager makes sure
    # the file handle is released (it used to be left open).
    with open(plan, 'rb') as file:
        text = PyPDF2.PdfReader(file).pages[0].extract_text()

    pdf = fitz.open(plan)
    try:
        page = pdf.load_page(0)

        # Iterating the string yields single characters. Each distinct
        # character is searched once; dict.fromkeys keeps first-seen order.
        for item in dict.fromkeys(text):
            for rect in page.search_for(item):
                page.add_redact_annot(rect)

        # Apply every pending redaction in one go, with the default options.
        # Previously this ran once per rectangle, followed by a second call
        # (images=PDF_REDACT_IMAGE_NONE) that had nothing left to apply.
        page.apply_redactions()

        pix = page.get_pixmap()
        # frombytes copies the pixel data, so the image outlives the document.
        clean_img = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
    finally:
        pdf.close()

    return clean_img