# -*- coding: utf-8 -*-
"""(Deployment)2.1 counting columns.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1R2CszBuVN-Rugu8CyGQzqsdFw11E3eHN

Counts structural columns on a construction-plan PDF: the drawing is
colour-segmented, column symbols are located as contour centroids, and each
centroid is matched to the nearest word from the PDF text layer to build a
legend (column type -> count) that is pushed to Google Sheets.

## Libraries
"""

# from google.colab.patches import cv2_imshow
import io  # kept: later notebook cells may rely on it
import statistics  # kept: later notebook cells may rely on it
from collections import Counter, defaultdict
from statistics import mode  # kept: later notebook cells may rely on it

import cv2
import numpy as np
import pandas as pd
from PIL import Image  # kept: later notebook cells may rely on it

# pip install PyPDF2
# pip install PyMuPDF
# pip install PyMuPDF==1.19.0
# !pip install pypdfium2
import fitz  # PyMuPDF
import pypdfium2 as pdfium

import pilecaps_adr

# All plan PDFs are resolved relative to this directory.
PLANS_DIR = 'dropbox_plans/2.1/'

"""# Functions"""


def get_text_from_pdf(input_pdf_path):
    """Return the "words" of the first page of a plan PDF.

    Each element is a PyMuPDF word tuple
    ``(x0, y0, x1, y1, word, block_no, line_no, word_no)``.

    Fixes of the original version: the page loop overwrote its result on
    every iteration, so only the *last* page's words were returned even
    though ``convert2img`` renders page 0 — text and image now always refer
    to the same page.  A no-op ``page.apply_redactions()`` call (no redaction
    annotations are ever added) was dropped, and the document is now closed
    before returning.
    """
    pdf_document = fitz.open(PLANS_DIR + input_pdf_path)
    try:
        return pdf_document[0].get_text("words")
    finally:
        pdf_document.close()


def convert2img(path):
    """Render page 0 of the plan PDF and return it as an OpenCV BGR image."""
    pdf = pdfium.PdfDocument(PLANS_DIR + path)
    page = pdf.get_page(0)
    pil_image = page.render().to_pil()
    rgb = np.array(pil_image)
    # PIL produces RGB; OpenCV works in BGR.
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def segment(img):
    """Keep only pixels whose HSV colour falls in the column-marker range.

    Returns the input image with everything outside the mask blacked out.
    """
    # Empirically tuned HSV range for the colour used to draw columns.
    lower_range = np.array([0, 9, 0])
    upper_range = np.array([81, 255, 255])
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, lower_range, upper_range)
    return cv2.bitwise_and(img, img, mask=mask)


def threshold(imgResult3):
    """Blur, grayscale and Otsu-binarise the segmented image."""
    blurred = cv2.GaussianBlur(imgResult3, (3, 3), 9)
    gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU the threshold value (0 here) is chosen automatically.
    return cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]


# Deleted the image drawing
def getColumnsPoints(outsu4):
    """Return the centroid ``(x, y)`` of every external contour.

    Contours with zero area (``m00 == 0``) have no defined centroid and are
    skipped.
    """
    contours, _hierarchy = cv2.findContours(
        image=outsu4, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_NONE)
    centroids = []
    for cnt in contours:
        m = cv2.moments(cnt)
        if m['m00'] != 0.0:
            centroids.append((int(m['m10'] / m['m00']),
                              int(m['m01'] / m['m00'])))
    return centroids


def getTextsPoints(x):
    """Return the bottom-right corner ``(x1, y1)`` of every PDF word tuple."""
    return [(h[2], h[3]) for h in x]


def distance(point1, point2):
    """Euclidean distance between two ``(x, y)`` points."""
    x1, y1 = point1
    x2, y2 = point2
    return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)


def getNearestText(point_list, p, max_dist=44):
    """Match each column centroid to its nearest text point within a radius.

    Parameters
    ----------
    point_list : list of ``(x, y)`` text-corner points.
    p : list of ``(x, y)`` column centroids.
    max_dist : matching radius in pixels; defaults to 44, the value the
        original hard-coded.

    Returns the matched text points.  Centroids with no text point closer
    than ``max_dist`` are dropped.  An empty ``point_list`` yields an empty
    result instead of crashing ``min()``.
    """
    if not point_list:
        return []
    nearby = []
    for centroid in p:
        nearest = min(point_list, key=lambda point: distance(point, centroid))
        if distance(nearest, centroid) < max_dist:
            nearby.append(nearest)
    return nearby


def getColumnsTypes(nearbyy, x):
    """Map matched text points back to their words (column-type labels).

    For every point in ``nearbyy``, every word tuple in ``x`` whose
    bottom-right corner equals that point contributes its word (index 4),
    preserving the original scan order.  A corner-keyed dict replaces the
    original O(len(nearbyy) * len(x)) nested loop.
    """
    words_by_corner = defaultdict(list)
    for tpl in x:
        words_by_corner[(tpl[2], tpl[3])].append(tpl[4])
    found_tuple = []
    for point in nearbyy:
        found_tuple.extend(words_by_corner.get(tuple(point), []))
    return found_tuple


def generate_legend(found_tuple):
    """Count occurrences of each column type.

    Returns a DataFrame with columns ``'Column Type'`` and ``'Count'`` in
    first-seen order (``Counter`` preserves insertion order like the
    original hand-rolled dict).
    """
    counts = Counter(found_tuple)
    return pd.DataFrame(counts.items(), columns=['Column Type', 'Count'])


def mainfun(plan, pathtoplan):
    """Full pipeline: plan PDF -> column legend -> Google Sheet URL."""
    texts_from_pdf = get_text_from_pdf(plan)
    img = convert2img(plan)
    segmented = segment(img)
    binary = threshold(segmented)
    column_points = getColumnsPoints(binary)
    text_points = getTextsPoints(texts_from_pdf)
    nearby = getNearestText(text_points, column_points)
    columns_types = getColumnsTypes(nearby, texts_from_pdf)
    legend = generate_legend(columns_types)
    # Only the sheet URL is needed here; the other returned handles are unused.
    (_gc, _spreadsheet_service, _spreadsheet_id,
     spreadsheet_url, _name_path_arr) = pilecaps_adr.legendGoogleSheets(
        legend, path=plan, pdfpath=pathtoplan)
    return spreadsheet_url


"""# Call"""