File size: 3,667 Bytes
dd3a079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
"""(Deployment)2.1 counting columns.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1R2CszBuVN-Rugu8CyGQzqsdFw11E3eHN

## Libraries
"""

# from google.colab.patches import cv2_imshow
import cv2
import numpy as np
import pandas as pd

import statistics
from statistics import mode

from PIL import Image

# pip install PyPDF2

# pip install PyMuPDF

# pip install PyMuPDF==1.19.0

import io

# !pip install pypdfium2
import pypdfium2 as pdfium

import fitz  # PyMuPDF

import pandas as pd
import pilecaps_adr
"""# Functions"""

def get_text_from_pdf(input_pdf_path):
    """Return the word list extracted from the last page of a plan PDF.

    The document is opened from ``'dropbox_plans/2.1/' + input_pdf_path`` and
    is closed again before returning.

    Args:
        input_pdf_path: File name of the PDF, appended to the
            ``dropbox_plans/2.1/`` prefix.

    Returns:
        Whatever ``page.get_text("words")`` yields for the last page of the
        document (PyMuPDF documents each entry as
        ``(x0, y0, x1, y1, word, block_no, line_no, word_no)``), or an empty
        list when the document has no pages.
    """
    with fitz.open('dropbox_plans/2.1/' + input_pdf_path) as pdf_document:
        if pdf_document.page_count == 0:
            # Nothing to extract; avoids returning an unbound variable.
            return []
        # NOTE(review): only the LAST page's words have ever been returned
        # here, while convert2img() renders page 0. The two agree for
        # single-page plans; confirm the intent for multi-page PDFs.
        last_page = pdf_document[pdf_document.page_count - 1]
        return last_page.get_text("words")

def convert2img(path):
    """Render the first page of a plan PDF as an OpenCV BGR image.

    Args:
        path: File name of the PDF, appended to the ``dropbox_plans/2.1/``
            prefix.

    Returns:
        The rendered page 0 as a NumPy array, converted from RGB to BGR
        channel order.
    """
    document = pdfium.PdfDocument('dropbox_plans/2.1/' + path)
    first_page = document.get_page(0)
    # Render with pypdfium2's defaults and go through PIL to get an array.
    rgb_pixels = np.array(first_page.render().to_pil())
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

def segment(img):
    """Keep only the pixels of a BGR image that fall inside a fixed HSV range.

    Args:
        img: Image in BGR channel order.

    Returns:
        A copy of ``img`` in which every pixel whose HSV value lies outside
        ``[0, 9, 0]``..``[81, 255, 255]`` is zeroed out.
    """
    hsv_image = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Inclusive lower/upper HSV bounds of the colours to keep.
    in_range_mask = cv2.inRange(
        hsv_image,
        np.array([0, 9, 0]),
        np.array([81, 255, 255]),
    )
    return cv2.bitwise_and(img, img, mask=in_range_mask)

def threshold(imgResult3):
    """Binarise a BGR image using a Gaussian blur followed by Otsu's method.

    Args:
        imgResult3: Image in BGR channel order (e.g. the output of
            ``segment``).

    Returns:
        The single-channel binary image produced by ``cv2.threshold`` with
        ``THRESH_BINARY + THRESH_OTSU``.
    """
    blurred = cv2.GaussianBlur(imgResult3, (3, 3), 9)
    grayscale = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    # cv2.threshold returns (chosen_threshold, image); only the image is used.
    _, binary = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return binary

# Deleted the image drawing
def getColumnsPoints(outsu4):
    """Return the centroid of every external contour in a binary image.

    Args:
        outsu4: Single-channel binary image (e.g. the output of
            ``threshold``).

    Returns:
        A list of ``(x, y)`` integer tuples, one per contour whose area
        moment ``m00`` is non-zero. Contours with ``m00 == 0`` have no
        defined centroid and are skipped.
    """
    contours, _ = cv2.findContours(
        image=outsu4, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_NONE
    )
    p = []
    for cnt in contours:
        M = cv2.moments(cnt)
        if M['m00'] == 0.0:
            # Degenerate contour: skip it instead of re-appending the previous
            # centroid (or failing when it is the very first contour).
            continue
        p.append((int(M['m10'] / M['m00']), int(M['m01'] / M['m00'])))
    return p

def getTextsPoints(x):
    """Collect the anchor point of every extracted word.

    Args:
        x: Iterable of indexable word records; elements 2 and 3 of each
            record are used as the point's coordinates.

    Returns:
        A list of ``(record[2], record[3])`` tuples in input order.
    """
    return [(word[2], word[3]) for word in x]

def distance(point1, point2):
    """Return the Euclidean distance between two 2-D points.

    Args:
        point1: Two-element ``(x, y)`` sequence.
        point2: Two-element ``(x, y)`` sequence.

    Returns:
        The straight-line distance, computed with ``np.sqrt``.
    """
    (ax, ay), (bx, by) = point1, point2
    squared_sum = (ax - bx) ** 2 + (ay - by) ** 2
    return np.sqrt(squared_sum)

def getNearestText(point_list, p, max_distance=44):
    """Find, for each column point, the closest text point within a cutoff.

    Args:
        point_list: Candidate text points as ``(x, y)`` tuples.
        p: Column points as ``(x, y)`` tuples.
        max_distance: Exclusive upper bound on the distance between a column
            point and its nearest text point for the match to be kept.
            Defaults to 44, the value that was previously hard-coded.

    Returns:
        A list with the nearest text point of every column point whose
        nearest text point lies closer than ``max_distance``, in the order of
        ``p``. Column points without a close enough text contribute nothing.
        Returns an empty list when ``point_list`` is empty.
    """
    if not point_list:
        # No text to match against; min() would raise on an empty sequence.
        return []

    nearbyy = []
    for column_point in p:
        # Compute each distance once and keep the first minimum, so ties
        # resolve to the earliest candidate in point_list.
        nearest_dist, nearest_point = min(
            ((distance(candidate, column_point), candidate)
             for candidate in point_list),
            key=lambda pair: pair[0],
        )
        if nearest_dist < max_distance:
            nearbyy.append(nearest_point)
    return nearbyy

def getColumnsTypes(nearbyy, x):
    """Look up the text of every word record anchored at a matched point.

    Args:
        nearbyy: Matched text points as ``(x, y)`` tuples.
        x: Indexable word records; elements 2 and 3 are compared against
            the point and element 4 is the value collected.

    Returns:
        A list of ``record[4]`` values. Points are processed in the order of
        ``nearbyy`` and, for each point, matching records in the order of
        ``x``; a point that matches several records contributes several
        entries.
    """
    return [
        word[4]
        for target in nearbyy
        for word in x
        if word[2] == target[0] and word[3] == target[1]
    ]

def generate_legend(found_tuple):
    """Build a frequency table of column-type labels.

    Args:
        found_tuple: Iterable of hashable labels (one per matched column).

    Returns:
        A DataFrame with the columns ``'Column Type'`` and ``'Count'``, one
        row per distinct label in order of first appearance.
    """
    occurrences = {}
    for label in found_tuple:
        occurrences[label] = occurrences.get(label, 0) + 1
    return pd.DataFrame(occurrences.items(), columns=['Column Type', 'Count'])

def mainfun(plan, pathtoplan):
    """Count the columns on a plan and publish the legend to Google Sheets.

    Runs the pipeline defined in this module: extract the PDF words, render
    and segment the page, locate contour centroids, match each centroid to
    its nearest word, tally the matched words, and hand the tally to
    ``pilecaps_adr.legendGoogleSheets``.

    Args:
        plan: PDF file name, passed to ``get_text_from_pdf`` and
            ``convert2img`` and forwarded as ``path``.
        pathtoplan: Forwarded unchanged as ``pdfpath``.

    Returns:
        The fourth value returned by ``pilecaps_adr.legendGoogleSheets``
        (the spreadsheet URL).
    """
    pdf_words = get_text_from_pdf(plan)

    # Image side: page render -> colour mask -> binary image -> centroids.
    page_image = convert2img(plan)
    binary_image = threshold(segment(page_image))
    centroids = getColumnsPoints(binary_image)

    # Text side: match each centroid to a word and tally the labels.
    word_points = getTextsPoints(pdf_words)
    matched_points = getNearestText(word_points, centroids)
    labels = getColumnsTypes(matched_points, pdf_words)
    legend_table = generate_legend(labels)

    # Five values come back; only the URL is needed here.
    _gc, _service, _sheet_id, spreadsheet_url, _name_paths = pilecaps_adr.legendGoogleSheets(
        legend_table, path=plan, pdfpath=pathtoplan
    )
    return spreadsheet_url

"""# Call"""