# NOTE(review): removed "Spaces: Sleeping" lines — Hugging Face Spaces page
# residue from scraping, not part of the source.
# -*- coding: utf-8 -*-
"""(Deployment)2.1 counting columns.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1R2CszBuVN-Rugu8CyGQzqsdFw11E3eHN

## Libraries
"""
# from google.colab.patches import cv2_imshow
# pip install PyPDF2
# pip install PyMuPDF
# pip install pip install PyMuPDF==1.19.0
# !pip install pypdfium2
import io
import statistics
from collections import Counter
from statistics import mode

import cv2
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import pypdfium2 as pdfium
from PIL import Image

import pilecaps_adr
"""# Functions"""
def get_text_from_pdf(input_pdf_path):
    """Extract word tuples from the first page of a plan PDF.

    Parameters
    ----------
    input_pdf_path : str
        PDF file name, resolved relative to 'dropbox_plans/2.1/'.

    Returns
    -------
    list[tuple]
        PyMuPDF "words" tuples for page 0; downstream code reads items 2/3
        (bbox end coordinates) and item 4 (the word text). Page 0 is used so
        the coordinates line up with the image convert2img() renders, which
        is also page 0 — the original loop kept only the LAST page's words.
    """
    pdf_document = fitz.open('dropbox_plans/2.1/' + input_pdf_path)
    try:
        # The original apply_redactions() call was a no-op (no redaction
        # annotations were ever added) and has been dropped.
        return pdf_document[0].get_text("words")
    finally:
        pdf_document.close()  # don't leak the open file handle
def convert2img(path):
    """Rasterize page 0 of a plan PDF into a BGR OpenCV image.

    Parameters
    ----------
    path : str
        PDF file name, resolved relative to 'dropbox_plans/2.1/'.

    Returns
    -------
    numpy.ndarray
        The rendered first page as a BGR image ready for cv2 processing.
    """
    pdf = pdfium.PdfDocument('dropbox_plans/2.1/' + path)
    try:
        page = pdf.get_page(0)
        try:
            rgb_array = np.array(page.render().to_pil())
        finally:
            page.close()  # release pdfium page resources deterministically
        # PIL yields RGB channel order; OpenCV expects BGR.
        return cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)
    finally:
        pdf.close()  # original never closed the document (handle leak)
def segment(img):
    """Keep only pixels whose HSV colour falls inside the column colour band.

    The BGR input is converted to HSV, masked against a fixed hue/saturation
    window, and everything outside the window is blacked out.
    """
    hsv_lo = np.array([0, 9, 0])
    hsv_hi = np.array([81, 255, 255])
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    band_mask = cv2.inRange(hsv_img, hsv_lo, hsv_hi)
    return cv2.bitwise_and(img, img, mask=band_mask)
def threshold(imgResult3):
    """Binarize the colour-segmented image with Otsu's method.

    A light Gaussian blur suppresses speckle noise, then the grayscale image
    is auto-thresholded (Otsu) into a 0/255 mask.
    """
    blurred = cv2.GaussianBlur(imgResult3, (3, 3), 9)
    as_gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(as_gray, 0, 255,
                              cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
# Deleted the image drawing
def getColumnsPoints(outsu4):
    """Locate candidate column markers in a binary image.

    Finds the external contours of the thresholded image and returns the
    centroid of each one, computed from image moments. Contours with zero
    area (m00 == 0) have no defined centroid and are skipped.

    Parameters
    ----------
    outsu4 : numpy.ndarray
        Binary (0/255) single-channel image, e.g. the output of threshold().

    Returns
    -------
    list[tuple[int, int]]
        (x, y) integer centroids, one per non-degenerate contour.
    """
    contours, _ = cv2.findContours(image=outsu4, mode=cv2.RETR_EXTERNAL,
                                   method=cv2.CHAIN_APPROX_NONE)
    centroids = []
    # The original enumerate() index was never used.
    for cnt in contours:
        m = cv2.moments(cnt)
        if m['m00'] != 0.0:
            centroids.append((int(m['m10'] / m['m00']),
                              int(m['m01'] / m['m00'])))
    return centroids
def getTextsPoints(x):
    """Project word tuples onto their (item 2, item 3) coordinate pairs.

    Each element of *x* is a PyMuPDF "words" tuple; positions 2 and 3 hold
    the bounding-box end coordinates that the nearest-text search matches
    against column centroids.
    """
    return [(word[2], word[3]) for word in x]
def distance(point1, point2):
    """Return the Euclidean distance between two 2-D points."""
    dx = point1[0] - point2[0]
    dy = point1[1] - point2[1]
    return np.sqrt(dx * dx + dy * dy)
def getNearestText(point_list, p, max_dist=44):
    """For each column centroid, find the closest text point within range.

    Parameters
    ----------
    point_list : list[tuple]
        Candidate text end-points, e.g. from getTextsPoints(). Must be
        non-empty (min() over an empty list raises, as in the original).
    p : list[tuple]
        Column centroids, e.g. from getColumnsPoints().
    max_dist : float, optional
        Maximum pixel distance for a text point to count as a column's
        label. Defaults to the previously hard-coded 44.

    Returns
    -------
    list[tuple]
        The nearest text point for each centroid that has one closer than
        *max_dist*; centroids without one contribute nothing.
    """
    nearby = []
    for centroid in p:
        nearest_point = min(point_list,
                            key=lambda point: distance(point, centroid))
        # The original also collected every distance in an unused list.
        if distance(nearest_point, centroid) < max_dist:
            nearby.append(nearest_point)
    return nearby
def getColumnsTypes(nearbyy, x):
    """Map matched text points back to their word strings.

    For each point in *nearbyy*, every word tuple in *x* whose bbox end
    coordinates (items 2 and 3) equal that point contributes its text
    (item 4), in the same outer-then-inner iteration order as before.
    """
    return [
        tpl[4]
        for pt in nearbyy
        for tpl in x
        if tpl[2] == pt[0] and tpl[3] == pt[1]
    ]
def generate_legend(found_tuple):
    """Tally column-type labels into a two-column summary DataFrame.

    Parameters
    ----------
    found_tuple : list[str]
        Column-type strings, one per detected column (may repeat).

    Returns
    -------
    pandas.DataFrame
        Columns 'Column Type' and 'Count', one row per distinct label in
        first-seen order (Counter preserves insertion order like dict,
        matching the original hand-rolled frequency loop).
    """
    word_freq = Counter(found_tuple)
    return pd.DataFrame(word_freq.items(), columns=['Column Type', 'Count'])
def mainfun(plan, pathtoplan):
    """End-to-end column-counting pipeline for a single plan PDF.

    Extracts the words and a rasterized image from the PDF, segments and
    thresholds the image to find column markers, matches each marker to the
    nearest text label, tallies the labels into a legend DataFrame, pushes
    the legend to Google Sheets via pilecaps_adr, and returns the sheet URL.

    Parameters
    ----------
    plan : str
        PDF file name (resolved under 'dropbox_plans/2.1/' by the helpers).
    pathtoplan : str
        Passed through to pilecaps_adr.legendGoogleSheets as ``pdfpath``.

    Returns
    -------
    The spreadsheet URL produced by pilecaps_adr.legendGoogleSheets.
    """
    texts_from_pdf = get_text_from_pdf(plan)
    img = convert2img(plan)
    outsu = threshold(segment(img))
    column_points = getColumnsPoints(outsu)
    text_points = getTextsPoints(texts_from_pdf)
    nearby = getNearestText(text_points, column_points)
    columns_types = getColumnsTypes(nearby, texts_from_pdf)
    legend = generate_legend(columns_types)
    # Only the URL is needed; the other four returned values (gspread
    # client, service, spreadsheet id, name paths) were unused locals.
    _, _, _, spreadsheet_url, _ = pilecaps_adr.legendGoogleSheets(
        legend, path=plan, pdfpath=pathtoplan)
    return spreadsheet_url
"""# Call"""