Marthee committed on
Commit
dd3a079
·
1 Parent(s): a7246a5

Upload tameem2_1.py

Browse files
Files changed (1) hide show
  1. tameem2_1.py +139 -0
tameem2_1.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """(Deployment)2.1 counting columns.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1R2CszBuVN-Rugu8CyGQzqsdFw11E3eHN
8
+
9
+ ## Libraries
10
+ """
11
+
12
+ # from google.colab.patches import cv2_imshow
13
+ import cv2
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ import statistics
18
+ from statistics import mode
19
+
20
+ from PIL import Image
21
+
22
+ # pip install PyPDF2
23
+
24
+ # pip install PyMuPDF
25
+
26
+ # pip install PyMuPDF==1.19.0
27
+
28
+ import io
29
+
30
+ # !pip install pypdfium2
31
+ import pypdfium2 as pdfium
32
+
33
+ import fitz # PyMuPDF
34
+
35
+ import pandas as pd
36
+ import pilecaps_adr
37
+ """# Functions"""
38
+
39
def get_text_from_pdf(input_pdf_path):
    """Extract the word-level text entries of a plan PDF.

    The file is opened from the fixed ``dropbox_plans/2.1/`` folder. Every
    page is visited, and the entries of the last page visited are the ones
    returned (for a single-page plan that is simply that page's words).

    Args:
        input_pdf_path: File name of the PDF, relative to
            ``dropbox_plans/2.1/``.

    Returns:
        The value of ``page.get_text("words")`` for the last page, or an
        empty list when the document has no pages. Per the PyMuPDF
        documentation each entry is expected to look like
        ``(x0, y0, x1, y1, word, block_no, line_no, word_no)``.
    """
    pdf_document = fitz.open('dropbox_plans/2.1/' + input_pdf_path)
    # Start from an empty result so a page-less document returns [] instead
    # of failing with UnboundLocalError at the return statement.
    text_instances = []
    try:
        for page in pdf_document:
            text_instances = page.get_text("words")
            # Runs after the words were read, so it cannot change the
            # returned entries; the document is never saved either.
            page.apply_redactions()
    finally:
        # Release the file handle even if text extraction raises.
        pdf_document.close()
    return text_instances
48
+
49
def convert2img(path):
    """Render the first page of a plan PDF as a BGR image array.

    Args:
        path: File name of the PDF, relative to ``dropbox_plans/2.1/``.

    Returns:
        The rendered page (default render settings) converted from RGB to
        BGR channel order with ``cv2.cvtColor``.
    """
    document = pdfium.PdfDocument('dropbox_plans/2.1/' + path)
    first_page = document.get_page(0)
    rendered = first_page.render()
    rgb_pixels = np.array(rendered.to_pil())
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
56
+
57
def segment(img):
    """Black out every pixel whose HSV value is outside a fixed range.

    Args:
        img: Image in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2HSV``).

    Returns:
        A copy of ``img`` in which only pixels with HSV between
        ``[0, 9, 0]`` and ``[81, 255, 255]`` (inclusive) are kept.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    lower_bound = np.array([0, 9, 0])
    upper_bound = np.array([81, 255, 255])
    in_range_mask = cv2.inRange(hsv_img, lower_bound, upper_bound)
    return cv2.bitwise_and(img, img, mask=in_range_mask)
64
+
65
def threshold(imgResult3):
    """Binarize a BGR image with Otsu's method after a light blur.

    Args:
        imgResult3: Image in BGR channel order.

    Returns:
        The single-channel binary image produced by ``cv2.threshold`` with
        ``THRESH_BINARY + THRESH_OTSU``.
    """
    blurred = cv2.GaussianBlur(imgResult3, (3, 3), 9)
    grayscale = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    # cv2.threshold returns (computed_threshold, image); only the image is used.
    _, binary = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return binary
70
+
71
+ # Deleted the image drawing
72
def getColumnsPoints(outsu4):
    """Return the centroid of every external contour in a binary image.

    Args:
        outsu4: Binary image handed to ``cv2.findContours``.

    Returns:
        A list of ``(x, y)`` integer centroids, one per contour whose
        zeroth moment (``m00``) is non-zero.
    """
    contours, _hierarchy = cv2.findContours(
        image=outsu4, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_NONE
    )
    centroids = []
    for contour in contours:
        moments = cv2.moments(contour)
        # A zero m00 would make the centroid division undefined; skip it.
        if moments['m00'] == 0.0:
            continue
        center_x = int(moments['m10'] / moments['m00'])
        center_y = int(moments['m01'] / moments['m00'])
        centroids.append((center_x, center_y))
    return centroids
82
+
83
def getTextsPoints(x):
    """Build one anchor point per text entry from its items 2 and 3.

    Args:
        x: Iterable of indexable text entries; presumably the PyMuPDF
            "words" tuples, where items 2 and 3 are the x1/y1 corner
            (confirm against the caller).

    Returns:
        A list of ``(entry[2], entry[3])`` tuples in input order.
    """
    return [(entry[2], entry[3]) for entry in x]
88
+
89
def distance(point1, point2):
    """Return the Euclidean distance between two 2-D points.

    Args:
        point1: First point as an ``(x, y)`` pair.
        point2: Second point as an ``(x, y)`` pair.

    Returns:
        The straight-line distance, computed with ``np.sqrt``.
    """
    (ax, ay), (bx, by) = point1, point2
    squared_length = (ax - bx) ** 2 + (ay - by) ** 2
    return np.sqrt(squared_length)
93
+
94
def getNearestText(point_list, p, max_distance=44):
    """Pick, for each column point, the closest text point within range.

    Args:
        point_list: Candidate text points as ``(x, y)`` pairs.
        p: Column points as ``(x, y)`` pairs.
        max_distance: Exclusive upper bound on the distance between a column
            point and its nearest text point for that text point to be kept.
            Defaults to 44, the value that used to be hard-coded.

    Returns:
        A list of text points, in the order of ``p``, containing the nearest
        text point of every column point that has one closer than
        ``max_distance``. The same text point can appear more than once.
        Empty when either input is empty.
    """
    # min() raises ValueError on an empty sequence, so a plan without any
    # text simply has no nearby labels.
    if not point_list:
        return []

    nearbyy = []
    for column_point in p:
        nearest_point = min(
            point_list, key=lambda point: distance(point, column_point)
        )
        if distance(nearest_point, column_point) < max_distance:
            nearbyy.append(nearest_point)
    return nearbyy
104
+
105
def getColumnsTypes(nearbyy, x):
    """Look up item 4 of every text entry that sits on a matched point.

    Args:
        nearbyy: Points as ``(x, y)`` pairs to look up.
        x: Indexable text entries; an entry matches a point when its items
            2 and 3 equal the point's two coordinates.

    Returns:
        A list with item 4 of each matching entry, ordered by point first
        and by entry second. Repeated points and repeated matches are all
        reported.
    """
    return [
        entry[4]
        for anchor in nearbyy
        for entry in x
        if entry[2] == anchor[0] and entry[3] == anchor[1]
    ]
113
+
114
def generate_legend(found_tuple):
    """Count how often each label occurs and tabulate the result.

    Args:
        found_tuple: Iterable of hashable labels.

    Returns:
        A pandas DataFrame with the columns ``'Column Type'`` and
        ``'Count'``, one row per distinct label in first-seen order.
    """
    occurrences = {}
    for label in found_tuple:
        occurrences[label] = occurrences.get(label, 0) + 1
    return pd.DataFrame(occurrences.items(), columns=['Column Type', 'Count'])
124
+
125
def mainfun(plan, pathtoplan):
    """Count the labelled columns of a plan and publish the legend.

    The plan's words are read, its first page is rendered, segmented and
    thresholded, contour centroids are matched to their nearest words, and
    the resulting label counts are handed to
    ``pilecaps_adr.legendGoogleSheets``.

    Args:
        plan: File name of the plan PDF; also forwarded as ``path``.
        pathtoplan: Forwarded unchanged as ``pdfpath``.

    Returns:
        The spreadsheet URL, i.e. the fourth of the five values returned by
        ``pilecaps_adr.legendGoogleSheets``.
    """
    words = get_text_from_pdf(plan)
    binary_mask = threshold(segment(convert2img(plan)))
    centroids = getColumnsPoints(binary_mask)
    word_points = getTextsPoints(words)
    matched_points = getNearestText(word_points, centroids)
    labels = getColumnsTypes(matched_points, words)
    legend = generate_legend(labels)
    # Only the URL is needed; the other four results are discarded.
    _gc, _service, _sheet_id, spreadsheet_url, _name_paths = (
        pilecaps_adr.legendGoogleSheets(legend, path=plan, pdfpath=pathtoplan)
    )
    return spreadsheet_url
137
+
138
+ """# Call"""
139
+