Spaces:

Marthee
/

PileCaps

Sleeping

App Files Files Community

PileCaps / PreprocessingFoundation.py

Marthee

Create PreprocessingFoundation.py

23b88e9 almost 3 years ago

raw

history blame contribute delete

2.2 kB

	import cv2
	import numpy as np
	import PyPDF2
	import fitz
	from PIL import Image



	def rmv_dashedLines(clean_img1, *, far_edge_px=4050):
	    """Paint over long thin line segments and return the cleaned green channel.

	    The input is binarised (inverse threshold at 200), thick strokes are
	    masked out with an erode/dilate pass, and the remaining thin strokes are
	    searched for long segments with the probabilistic Hough transform.  Every
	    detected segment is drawn in pure green on a copy of the input, so that
	    it reads as 255 in the green channel, which is then opened with a 9x9
	    kernel and returned.

	    Args:
	        clean_img1: 3-channel image (anything ``np.array`` accepts) with the
	            text already removed.  The caller's object is not modified
	            because ``np.array`` makes a copy.
	        far_edge_px: In the second Hough pass only segments whose start point
	            has ``x1`` or ``y1`` greater than this value are painted over.
	            The default of 4050 is the value that used to be hard-coded;
	            presumably it is tuned for one drawing size -- TODO confirm.

	    Returns:
	        2-D ``uint8`` array: the green channel after one erode and one dilate
	        with a 9x9 kernel.
	    """
	    clean_img = np.array(clean_img1)

	    kernel1 = np.ones((3, 5), np.uint8)
	    kernel2 = np.ones((9, 9), np.uint8)

	    # NOTE(review): BGR ordering is assumed here; if the caller passes an RGB
	    # array (e.g. converted from a PIL image) the grey weights differ slightly.
	    imgGray = cv2.cvtColor(clean_img, cv2.COLOR_BGR2GRAY)
	    imgBW = cv2.threshold(imgGray, 200, 255, cv2.THRESH_BINARY_INV)[1]

	    # Keep only the foreground pixels that are NOT close to strokes thick
	    # enough to survive the 3x5 erosion.
	    thick_seed = cv2.erode(imgBW, kernel1, iterations=1)
	    thick_zone = cv2.dilate(thick_seed, kernel2, iterations=3)
	    outside_thick = cv2.bitwise_not(cv2.bitwise_and(imgBW, thick_zone))
	    thin_strokes = cv2.bitwise_and(imgBW, imgBW, mask=outside_thick)

	    h, w = clean_img.shape[:2]
	    green_bgr = (0, 255, 0)

	    # adjust length of center line based on h,w of img
	    long_lines = cv2.HoughLinesP(thin_strokes, 1, np.pi / 180, 200,
	                                 minLineLength=(w - h), maxLineGap=120)
	    # HoughLinesP returns None (not an empty array) when nothing is found.
	    if long_lines is not None:
	        for x1, y1, x2, y2 in long_lines.reshape(-1, 4).tolist():
	            cv2.line(clean_img, (x1, y1), (x2, y2), green_bgr, 2)

	    # adjust length of center line based on h,w of img
	    shorter_lines = cv2.HoughLinesP(thin_strokes, 1, np.pi / 180, 150,
	                                    minLineLength=w // 4, maxLineGap=90)
	    if shorter_lines is not None:
	        for x1, y1, x2, y2 in shorter_lines.reshape(-1, 4).tolist():
	            if x1 > far_edge_px or y1 > far_edge_px:
	                cv2.line(clean_img, (x1, y1), (x2, y2), green_bgr, 2)

	    # Painted segments are 255 in the green channel, i.e. background-coloured.
	    green = clean_img[:, :, 1]

	    eroded = cv2.erode(green, kernel2, iterations=1)
	    dilated = cv2.dilate(eroded, kernel2, iterations=1)

	    return dilated


	####################################################################################################
	def rmv_text(plan):
	    """Render page 0 of a PDF with its extractable text redacted.

	    The text of the first page is read with PyPDF2; every distinct character
	    of that text is then located on the page with PyMuPDF, covered by a
	    redaction annotation, and the redactions are applied before the page is
	    rasterised.

	    Args:
	        plan: Path of the PDF file.  It is opened twice, once by PyPDF2 and
	            once by PyMuPDF.

	    Returns:
	        ``PIL.Image.Image`` in RGB mode holding the rendered first page.
	    """
	    # EXTRACT TEXT FROM PDF TO REMOVE IT
	    with open(plan, 'rb') as file:
	        text = PyPDF2.PdfReader(file).pages[0].extract_text() or ""

	    pdf = fitz.open(plan)
	    try:
	        page = pdf.load_page(0)

	        # Searching once per distinct character yields the same rectangles as
	        # searching once per occurrence, without the repeated work.
	        for item in dict.fromkeys(text):
	            for rect in page.search_for(item):
	                page.add_redact_annot(rect)

	        # delete all text in redactions
	        # NOTE(review): the original followed this with a second call,
	        # apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE), which had no
	        # annotations left to process.  If images overlapping the text are
	        # meant to stay intact, pass that flag to this call instead -- confirm.
	        page.apply_redactions()

	        pix = page.get_pixmap()
	        # frombytes copies the samples, so the image outlives the document.
	        clean_img = Image.frombytes('RGB', (pix.width, pix.height), pix.samples)
	    finally:
	        pdf.close()

	    return clean_img