Spaces:
Sleeping
Sleeping
File size: 3,667 Bytes
dd3a079 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
# -*- coding: utf-8 -*-
"""(Deployment)2.1 counting columns.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1R2CszBuVN-Rugu8CyGQzqsdFw11E3eHN
## Libraries
"""
# from google.colab.patches import cv2_imshow
import cv2
import numpy as np
import pandas as pd
import statistics
from statistics import mode
from PIL import Image
# pip install PyPDF2
# pip install PyMuPDF
# pip install PyMuPDF==1.19.0
import io
# !pip install pypdfium2
import pypdfium2 as pdfium
import fitz # PyMuPDF
import pandas as pd
import pilecaps_adr
"""# Functions"""
def get_text_from_pdf(input_pdf_path):
    """Extract word tuples from a plan PDF under dropbox_plans/2.1/.

    Args:
        input_pdf_path: file name of the PDF, relative to 'dropbox_plans/2.1/'.

    Returns:
        The list of PyMuPDF "words" tuples (x0, y0, x1, y1, word, ...) for the
        last page iterated. NOTE(review): text_instances is overwritten each
        iteration, so only one page's words survive — presumably these plans
        are single-page (convert2img also renders page 0 only); confirm.
        Returns [] for an empty document (the original raised
        UnboundLocalError in that case).
    """
    pdf_document = fitz.open('dropbox_plans/2.1/' + input_pdf_path)
    try:
        text_instances = []
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            text_instances = page.get_text("words")
            # NOTE(review): no redaction annotations are ever added, so this
            # call is a no-op; kept to preserve original behavior.
            page.apply_redactions()
        return text_instances
    finally:
        # Always release the document handle (the original leaked it).
        pdf_document.close()
def convert2img(path):
    """Render page 0 of a plan PDF to an OpenCV BGR image.

    Args:
        path: file name of the PDF, relative to 'dropbox_plans/2.1/'.

    Returns:
        numpy.ndarray in BGR channel order (OpenCV convention).
    """
    pdf = pdfium.PdfDocument('dropbox_plans/2.1/' + path)
    try:
        page = pdf.get_page(0)
        try:
            pil_image = page.render().to_pil()
        finally:
            # Release native page resources (the original leaked both handles).
            page.close()
    finally:
        pdf.close()
    rgb = np.array(pil_image)
    # PIL gives RGB; OpenCV expects BGR.
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
def segment(img):
    """Mask the input image down to the HSV color band that marks columns.

    Args:
        img: BGR image (OpenCV convention).

    Returns:
        BGR image with every pixel outside the HSV range
        [0, 9, 0]..[81, 255, 255] zeroed out.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    lower = np.array([0, 9, 0])
    upper = np.array([81, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)
    return cv2.bitwise_and(img, img, mask=mask)
def threshold(imgResult3):
    """Binarize a segmented BGR image via blur + grayscale + Otsu threshold.

    Args:
        imgResult3: BGR image (output of segment()).

    Returns:
        Single-channel binary image (0 or 255 per pixel).
    """
    blurred = cv2.GaussianBlur(imgResult3, (3, 3), 9)
    gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    # Otsu picks the threshold automatically; the 0 here is ignored.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
# Deleted the image drawing
def getColumnsPoints(outsu4):
    """Compute the centroid of every external contour in a binary mask.

    Args:
        outsu4: single-channel binary image (output of threshold()).

    Returns:
        List of (x, y) integer centroids, one per contour with nonzero area.
    """
    contours, _ = cv2.findContours(
        image=outsu4, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_NONE
    )
    centroids = []
    for contour in contours:
        moments = cv2.moments(contour)
        area = moments['m00']
        # Skip degenerate contours (zero area) to avoid division by zero.
        if area != 0.0:
            centroids.append((int(moments['m10'] / area),
                              int(moments['m01'] / area)))
    return centroids
def getTextsPoints(x):
    """Pull the (x1, y1) anchor point out of each PyMuPDF "words" tuple.

    Args:
        x: iterable of word tuples (x0, y0, x1, y1, word, ...).

    Returns:
        List of (x1, y1) pairs, one per word, in input order.
    """
    return [(word[2], word[3]) for word in x]
def distance(point1, point2):
    """Euclidean distance between two (x, y) points."""
    dx = point1[0] - point2[0]
    dy = point1[1] - point2[1]
    return np.sqrt(dx * dx + dy * dy)
def getNearestText(point_list, p):
nearbyy = []
dis = []
for i in range(len(p)):
nearest_point = min(point_list, key=lambda point: distance(point, p[i]))
dist = distance(nearest_point, p[i])
dis.append(dist)
if dist < 44:
nearbyy.append(nearest_point)
return nearbyy
def getColumnsTypes(nearbyy, x):
    """Map matched text anchors back to their word strings (column labels).

    Args:
        nearbyy: list of (x1, y1) anchor points (from getNearestText()).
        x: list of PyMuPDF "words" tuples (x0, y0, x1, y1, word, ...).

    Returns:
        List of word strings: for each anchor, every word in x whose (x1, y1)
        matches, in x's order — same output as the original nested scan, but
        built with a one-pass index (O(n + m) instead of O(n * m)).
    """
    words_by_anchor = {}
    for tpl in x:
        words_by_anchor.setdefault((tpl[2], tpl[3]), []).append(tpl[4])
    found_tuple = []
    for anchor in nearbyy:
        found_tuple.extend(words_by_anchor.get((anchor[0], anchor[1]), ()))
    return found_tuple
def generate_legend(found_tuple):
    """Tally column-type labels into a legend DataFrame.

    Args:
        found_tuple: iterable of label strings (from getColumnsTypes()).

    Returns:
        pandas.DataFrame with columns ['Column Type', 'Count'], one row per
        distinct label in first-occurrence order (same order as the original
        hand-rolled frequency dict).
    """
    from collections import Counter  # stdlib; replaces the manual freq dict
    counts = Counter(found_tuple)
    return pd.DataFrame(counts.items(), columns=['Column Type', 'Count'])
def mainfun(plan, pathtoplan):
    """Run the full column-counting pipeline for one plan PDF.

    Args:
        plan: PDF file name relative to 'dropbox_plans/2.1/'.
        pathtoplan: path passed through to pilecaps_adr.legendGoogleSheets.

    Returns:
        URL of the Google Sheet produced for the legend.
    """
    words = get_text_from_pdf(plan)
    page_img = convert2img(plan)
    segmented = segment(page_img)
    binary = threshold(segmented)
    centroids = getColumnsPoints(binary)
    anchors = getTextsPoints(words)
    matched = getNearestText(anchors, centroids)
    labels = getColumnsTypes(matched, words)
    legend = generate_legend(labels)
    # legendGoogleSheets returns 5 values; only the sheet URL is used here.
    _, _, _, spreadsheet_url, _ = pilecaps_adr.legendGoogleSheets(
        legend, path=plan, pdfpath=pathtoplan
    )
    return spreadsheet_url
"""# Call"""
|