Marthee committed on
Commit
e5cf808
·
verified ·
1 Parent(s): 49170fb

Create 2.1_Counting_Columns

Browse files
Files changed (1) hide show
  1. 2.1_Counting_Columns +167 -0
2.1_Counting_Columns ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import pandas as pd
4
+ import statistics
5
+ from statistics import mode
6
+ from PIL import Image
7
+ import io
8
+ import pypdfium2 as pdfium
9
+ import fitz # PyMuPDF
10
+ import os
11
+
12
def get_text_from_pdf(input_pdf_path):
    """Return the word entries PyMuPDF extracts from the last page of a PDF.

    Args:
        input_pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The result of ``page.get_text("words")`` for the final page of the
        document, or an empty list when the document has no pages.
    """
    # The context manager closes the document handle, which the previous
    # implementation left open.
    with fitz.open(input_pdf_path) as pdf_document:
        page_count = pdf_document.page_count
        if page_count == 0:
            # Previously this case raised UnboundLocalError on the return.
            return []
        # The earlier loop overwrote its result on every page, so only the
        # last page's words were ever returned; read that page directly.
        # The former apply_redactions() call ran after extraction on a
        # document that was never saved, so it could not affect the result.
        # NOTE(review): convert2img renders page 0 while this reads the last
        # page -- identical for single-page plans; confirm for multi-page.
        return pdf_document[page_count - 1].get_text("words")
21
+
22
def convert2img(path):
    """Render page 0 of the PDF at *path* and return it as a BGR array.

    The page is rendered with pypdfium2's default settings, converted to a
    PIL image, then to a NumPy array whose channels are swapped from RGB to
    BGR order for use with OpenCV.
    """
    document = pdfium.PdfDocument(path)
    first_page = document.get_page(0)
    rgb_pixels = np.array(first_page.render().to_pil())
    # The PIL output is treated as RGB; OpenCV calls below this expect BGR.
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
29
+
30
def changeWhiteColumns(img):
    """Return a copy of *img* with near-white pixels painted (255, 0, 0).

    A pixel is selected when its HSV value is H == 0, S == 0 and V in
    [250, 255]. The input image itself is left untouched.
    """
    recoloured = img.copy()
    hsv_pixels = cv2.cvtColor(recoloured, cv2.COLOR_BGR2HSV)
    lower_white = np.array([0, 0, 250])
    upper_white = np.array([0, 0, 255])
    white_mask = cv2.inRange(hsv_pixels, lower_white, upper_white)
    # (255, 0, 0) is pure blue when the array is interpreted as BGR.
    recoloured[white_mask > 0] = (255, 0, 0)
    return recoloured
38
+
39
def changeGrayModify(img):
    """Paint the gray pixels of *img* (255, 0, 0) in place and return it.

    A pixel is selected when its HSV value is H == 0, S == 0 and V in
    [175, 199]. Unlike changeWhiteColumns, no copy is made: the array
    passed in is modified and the same object is returned.
    """
    hsv_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # A wider value band of 180-240 was tried earlier; 175-199 is in use.
    lower_gray = np.array([0, 0, 175])
    upper_gray = np.array([0, 0, 199])
    gray_mask = cv2.inRange(hsv_pixels, lower_gray, upper_gray)

    img[gray_mask > 0] = (255, 0, 0)
    return img
52
+
53
def segment_blue(gray_changed):
    """Keep only the fully saturated blue-to-red-hue pixels of a BGR image.

    Pixels whose HSV value lies in H [120, 179] with S == 255 and V == 255
    are retained; every other pixel becomes black in the returned image.
    """
    hsv_pixels = cv2.cvtColor(gray_changed, cv2.COLOR_BGR2HSV)
    hue_floor = np.array([120, 255, 255])
    hue_ceiling = np.array([179, 255, 255])
    selection = cv2.inRange(hsv_pixels, hue_floor, hue_ceiling)
    # AND-ing the image with itself under the mask zeroes unselected pixels.
    return cv2.bitwise_and(gray_changed, gray_changed, mask=selection)
62
+
63
def segment_brown(img):
    """Keep only the pixels of a BGR image that fall in the 'brown' HSV band.

    The band is H [0, 81], S [9, 255], V [0, 255]; pixels outside it are
    black in the returned image.
    """
    hsv_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    band_low = np.array([0, 9, 0])
    band_high = np.array([81, 255, 255])
    selection = cv2.inRange(hsv_pixels, band_low, band_high)
    # AND-ing the image with itself under the mask zeroes unselected pixels.
    return cv2.bitwise_and(img, img, mask=selection)
70
+
71
def threshold(imgResult3):
    """Binarise a BGR image with Otsu's method after a light Gaussian blur.

    The image is blurred with a 3x3 kernel (sigma 9), converted to
    grayscale and thresholded with THRESH_BINARY + THRESH_OTSU. Only the
    thresholded image is returned; the computed threshold value is dropped.
    """
    blurred = cv2.GaussianBlur(imgResult3, (3, 3), 9)
    grayscale = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return binary
76
+
77
def get_columns_info(outsu4, img, min_column_area=90, wall_area_threshold=881.0 * 2):
    """Classify external contours of a binary image as columns or walls by area.

    Args:
        outsu4: Binary image handed to ``cv2.findContours``.
        img: Image whose first two dimensions give the size of the masks.
        min_column_area: A contour must have an area strictly greater than
            this to count as a column. Defaults to the former constant 90.
        wall_area_threshold: Contours with an area strictly greater than
            this are walls; columns must be strictly smaller. Defaults to
            the former constant 881.0 * 2.

    Returns:
        A tuple ``(p, mask_clmns, mask_walls)`` where ``p`` is a list of
        ``(x, y)`` integer centroids of the column contours, and the two
        masks are uint8 arrays filled with 255 on which the column / wall
        contours are drawn filled with 0.
    """
    mask_shape = img.shape[:2]
    mask_clmns = np.ones(mask_shape, dtype="uint8") * 255
    mask_walls = np.ones(mask_shape, dtype="uint8") * 255

    contours, _ = cv2.findContours(
        image=outsu4, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_NONE
    )

    p = []  # centroid of every contour classified as a column
    for cnt in contours:
        M = cv2.moments(cnt)
        if M['m00'] == 0.0:
            # The centroid divides by m00, so it is undefined here.
            continue

        area = cv2.contourArea(cnt)
        if area > wall_area_threshold:
            cv2.drawContours(mask_walls, [cnt], -1, 0, -1)
        elif min_column_area < area < wall_area_threshold:
            # NOTE(review): an area exactly equal to wall_area_threshold is
            # neither a wall nor a column, as before -- confirm intended.
            centroid = (int(M['m10'] / M['m00']), int(M['m01'] / M['m00']))
            p.append(centroid)
            cv2.drawContours(mask_clmns, [cnt], -1, 0, -1)
    return p, mask_clmns, mask_walls
100
+
101
def getTextsPoints(x):
    """Return a list of ``(entry[2], entry[3])`` pairs, one per entry of *x*.

    Presumably *x* holds PyMuPDF word tuples, in which case the pair is the
    bottom-right corner of each word box -- verify against the caller.
    """
    return [(entry[2], entry[3]) for entry in x]
106
+
107
+
108
def distance(point1, point2):
    """Return the Euclidean distance between two 2-element points."""
    (ax, ay), (bx, by) = point1, point2
    delta_x = ax - bx
    delta_y = ay - by
    return np.sqrt(delta_x ** 2 + delta_y ** 2)
112
+
113
def getNearestText(point_list, p, max_distance=44):
    """For each point in *p*, pick the closest entry of *point_list* if near enough.

    Args:
        point_list: Candidate ``(x, y)`` points to choose from.
        p: ``(x, y)`` points for which a nearest candidate is sought.
        max_distance: A candidate is kept only when its distance is
            strictly less than this. Defaults to the former constant 44.

    Returns:
        A list of the selected candidates, in the order of *p*; points of
        *p* without a candidate inside the radius contribute nothing. The
        list is empty when *point_list* is empty.
    """
    if not point_list:
        # min() over an empty sequence raises ValueError; with no candidates
        # nothing can be near any point.
        return []

    nearbyy = []
    for target in p:
        nearest_point = min(
            point_list, key=lambda candidate: distance(candidate, target)
        )
        if distance(nearest_point, target) < max_distance:
            nearbyy.append(nearest_point)
    return nearbyy
123
+
124
+
125
def getColumnsTypes(nearbyy, x):
    """Collect the labels starting with "C" of entries located at the given points.

    For every point in *nearbyy* (in order), every entry of *x* whose
    elements 2 and 3 equal the point's two coordinates and whose element 4
    starts with "C" contributes that element 4 to the result.
    """
    return [
        entry[4]
        for anchor in nearbyy
        for entry in x
        if entry[2] == anchor[0]
        and entry[3] == anchor[1]
        and entry[4].startswith("C")
    ]
133
+
134
def generate_legend(found_tuple):
    """Tally the labels in *found_tuple* into a two-column DataFrame.

    Returns a DataFrame with columns 'Column Type' and 'Count', one row per
    distinct label in order of first appearance.
    """
    tally = {}
    for label in found_tuple:
        tally[label] = tally.get(label, 0) + 1
    return pd.DataFrame(tally.items(), columns=['Column Type', 'Count'])
144
+
145
def mainfun(plan):
    """Count the labelled columns of a PDF plan and return them as a legend.

    The plan is first segmented with the brown colour band. If that yields
    10 or fewer column candidates, the gray pixels are recoloured and the
    blue band is used instead. The candidates are then matched to the
    nearest extracted words and tallied by label.

    Args:
        plan: Path of the PDF, passed to get_text_from_pdf and convert2img.

    Returns:
        The DataFrame produced by generate_legend.
    """
    texts_from_pdf = get_text_from_pdf(plan)
    img = convert2img(plan)

    # First attempt: brown columns.
    brown_binary = threshold(segment_brown(img))
    column_points, _mask_clmns, _mask_walls = get_columns_info(brown_binary, img)

    if len(column_points) <= 10:
        # Too few brown candidates: fall back to blue columns.
        img_blue = changeGrayModify(img)
        blue_binary = threshold(segment_blue(img_blue))
        column_points, _mask_clmns, _mask_walls = get_columns_info(blue_binary, img)

    # Both paths finish the same way: match candidates to words, then tally.
    text_points = getTextsPoints(texts_from_pdf)
    nearby = getNearestText(text_points, column_points)
    columns_types = getColumnsTypes(nearby, texts_from_pdf)
    return generate_legend(columns_types)