Spaces:
Sleeping
Sleeping
File size: 9,343 Bytes
e5cf808 cccfd94 e5cf808 86f6cb5 e5cf808 249e9ba e5cf808 249e9ba 68a1f3c 249e9ba e5cf808 249e9ba e5cf808 283e6f5 e5cf808 283e6f5 e5cf808 86f6cb5 e5cf808 e221568 2e4290e 2b1f8fa 3cedb1f e221568 3cedb1f e221568 c562e63 e221568 3cedb1f e221568 538102d 7298db3 249e9ba b3e31f7 249e9ba 68a1f3c 249e9ba b3e31f7 249e9ba 283e6f5 249e9ba 283e6f5 249e9ba ed3a60c e8b5bb9 538102d e8b5bb9 6bbfa38 e8b5bb9 249e9ba e5cf808 d9d91e4 86f6cb5 e5cf808 d9d91e4 86f6cb5 e5cf808 249e9ba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
import cv2
import numpy as np
import pandas as pd
import statistics
from statistics import mode
from PIL import Image
import io
import google_sheet_Legend
import pypdfium2 as pdfium
import fitz # PyMuPDF
import os
def get_text_from_pdf(input_pdf_path):
    """Return the word tuples found on the first page of a PDF.

    The other stages of this module (``convert2img``,
    ``add_annotations_to_pdf`` and ``mainfun``) only ever work on page 0, so
    the text is read from that same page instead of looping over every page
    and keeping whichever one happened to come last.

    Args:
        input_pdf_path: Handed to ``fitz.open`` as its second positional
            argument (the in-memory stream), with ``'pdf'`` as the first.
            NOTE(review): despite the name this is presumably the PDF's raw
            bytes, not a filesystem path - confirm with the caller.

    Returns:
        The list produced by ``page.get_text("words")`` for page 0, or an
        empty list when the document has no pages.
    """
    # The context manager closes the document; the original left it open.
    with fitz.open('pdf', input_pdf_path) as pdf_document:
        if pdf_document.page_count == 0:
            return []
        # No apply_redactions() here: the document is discarded unsaved, so
        # redacting after the words were read had no observable effect.
        return pdf_document[0].get_text("words")
def convert2img(path):
    """Render page 0 of a PDF with pypdfium2 and return it as a BGR array.

    Args:
        path: Whatever ``pdfium.PdfDocument`` accepts as its input.

    Returns:
        The rendered page as a NumPy array converted from RGB to BGR
        channel order via OpenCV.
    """
    document = pdfium.PdfDocument(path)
    first_page = document.get_page(0)
    rgb_pixels = np.array(first_page.render().to_pil())
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
def changeWhiteColumns(img):
    """Return a copy of ``img`` with near-white pixels recoloured.

    Pixels whose HSV value is H == 0, S == 0 and 250 <= V <= 255 are set to
    ``(255, 0, 0)`` (blue, given the BGR conversion used here). The input
    array itself is left untouched.
    """
    recoloured = img.copy()
    hsv_pixels = cv2.cvtColor(recoloured, cv2.COLOR_BGR2HSV)
    lower_white = np.array([0, 0, 250])
    upper_white = np.array([0, 0, 255])
    white_mask = cv2.inRange(hsv_pixels, lower_white, upper_white)
    recoloured[white_mask > 0] = (255, 0, 0)
    return recoloured
def changeGrayModify(img):
    """Recolour light-gray pixels of ``img`` in place and return it.

    Pixels whose HSV value is H == 0, S == 0 and 175 <= V <= 199 are set to
    ``(255, 0, 0)`` (blue, given the BGR conversion used here).

    Note that, unlike ``changeWhiteColumns``, no copy is taken: the array
    passed in is the one that gets modified and returned.
    """
    hsv_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    lower_gray = np.array([0, 0, 175])
    upper_gray = np.array([0, 0, 199])
    gray_mask = cv2.inRange(hsv_pixels, lower_gray, upper_gray)
    img[gray_mask > 0] = (255, 0, 0)
    return img
def segment_blue(gray_changed):
    """Keep only the fully saturated, full-brightness pixels with hue 120-179.

    Every pixel outside the HSV range [120, 255, 255]..[179, 255, 255] is
    zeroed by masking the image with itself.
    """
    hsv_pixels = cv2.cvtColor(gray_changed, cv2.COLOR_BGR2HSV)
    blue_mask = cv2.inRange(
        hsv_pixels,
        np.array([120, 255, 255]),
        np.array([179, 255, 255]),
    )
    return cv2.bitwise_and(gray_changed, gray_changed, mask=blue_mask)
def segment_brown(img):
    """Keep only the pixels inside the HSV range [0, 9, 0]..[81, 255, 255].

    Pixels outside that range are zeroed by masking the image with itself.
    """
    hsv_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    lower_bound = np.array([0, 9, 0])
    upper_bound = np.array([81, 255, 255])
    in_range_mask = cv2.inRange(hsv_pixels, lower_bound, upper_bound)
    return cv2.bitwise_and(img, img, mask=in_range_mask)
def threshold(imgResult3):
    """Blur, grayscale and Otsu-binarise a BGR image.

    A 3x3 Gaussian blur (sigma 9) is applied first, then the result is
    converted to grayscale and thresholded with ``THRESH_BINARY`` combined
    with ``THRESH_OTSU``.

    Returns:
        The binary image (second element of ``cv2.threshold``'s result).
    """
    blurred = cv2.GaussianBlur(imgResult3, (3, 3), 9)
    grayscale = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return binary
def get_columns_info(outsu4, img, min_column_area=90, wall_area=881.0 * 2):
    """Split the external contours of a binary image into columns and walls.

    Contours are classified by area:

    * ``area > wall_area``: filled with 0 on ``mask_walls``.
    * ``min_column_area < area < wall_area``: filled with 0 on
      ``mask_clmns`` and their centroid is recorded.
    * anything else (including an area exactly equal to ``wall_area``) is
      ignored.

    Args:
        outsu4: Binary image handed to ``cv2.findContours``.
        img: Image whose first two shape dimensions size the output masks.
        min_column_area: Exclusive lower area bound for a column contour.
        wall_area: Area separating columns (below) from walls (above).

    Returns:
        ``(p, mask_clmns, mask_walls)`` where ``p`` is the list of integer
        ``(x, y)`` centroids of the column contours and both masks are
        uint8 arrays initialised to 255.
    """
    mask_clmns = np.full(img.shape[:2], 255, dtype="uint8")
    mask_walls = np.full(img.shape[:2], 255, dtype="uint8")
    contours, _ = cv2.findContours(
        image=outsu4, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_NONE
    )
    p = []  # centroid of every contour classified as a column
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area > wall_area:
            cv2.drawContours(mask_walls, [cnt], -1, 0, -1)
        elif min_column_area < area < wall_area:
            M = cv2.moments(cnt)
            if M['m00'] == 0.0:
                # No centroid can be derived from a zero zeroth moment.
                continue
            p.append((int(M['m10'] / M['m00']), int(M['m01'] / M['m00'])))
            cv2.drawContours(mask_clmns, [cnt], -1, 0, -1)
    return p, mask_clmns, mask_walls
def getTextsPoints(x):
    """Compute a midpoint for every word entry and index the words by it.

    For each entry ``h`` the midpoint is
    ``calculate_midpoint(h[1], h[0], h[3], h[2])``; note that index 1 is
    passed before index 0 and index 3 before index 2, so the first element
    of the resulting tuple comes from ``h[1]``/``h[3]``.

    Args:
        x: Iterable of indexable entries with at least five elements;
            element 4 is stored as the value for the midpoint.

    Returns:
        ``(point_list, pt_clm)``: the midpoints in input order, and a dict
        mapping each midpoint to ``h[4]`` (later entries overwrite earlier
        ones that share a midpoint).
    """
    point_list = []
    pt_clm = {}
    for entry in x:
        midpoint = calculate_midpoint(entry[1], entry[0], entry[3], entry[2])
        point_list.append(midpoint)
        pt_clm[midpoint] = entry[4]
    return point_list, pt_clm
def fix_90_ky_val(pt_clm, derotationMatrix):
    """Re-key a point-to-text dict after applying a derotation matrix.

    Each key is turned into a ``fitz.Point``, multiplied by
    ``derotationMatrix`` and stored under ``(int(y), int(x))`` of the
    transformed point, i.e. with the two coordinates swapped.

    Returns:
        A new dict with the transformed keys and the original values.
    """
    rekeyed = {}
    for key, label in pt_clm.items():
        moved = fitz.Point(key[0], key[1]) * derotationMatrix
        rekeyed[(int(moved.y), int(moved.x))] = label
    return rekeyed
def calculate_midpoint(x1, y1, x2, y2):
    """Return the midpoint of ``(x1, y1)`` and ``(x2, y2)`` as an int tuple.

    Each coordinate is averaged with true division and then passed through
    ``int()``, so fractional results are truncated toward zero.
    """
    mid_x = int((x1 + x2) / 2)
    mid_y = int((y1 + y2) / 2)
    return mid_x, mid_y
def getColumnsTypesKeyValue(nearbyy, pt_clm):
    """Look up the text stored for each point in ``nearbyy``.

    Args:
        nearbyy: Sequence of points used as keys.
        pt_clm: Mapping from point to text.

    Returns:
        The list of ``pt_clm`` values, in the order of ``nearbyy``.

    Raises:
        KeyError: If a point is not present in ``pt_clm``.
    """
    return [pt_clm[point] for point in nearbyy]
def fix_rotation_90(pc_coordinates, derotationMatrix):
    """Apply a derotation matrix to a list of points.

    Each coordinate pair is turned into a ``fitz.Point``, multiplied by
    ``derotationMatrix`` and emitted as ``(int(y), int(x))`` of the
    transformed point, i.e. with the two coordinates swapped.

    Returns:
        The list of transformed integer tuples, in input order.
    """
    transformed = (
        fitz.Point(pair[0], pair[1]) * derotationMatrix
        for pair in pc_coordinates
    )
    return [(int(point.y), int(point.x)) for point in transformed]
def distance(point1, point2):
    """Return the Euclidean distance between two 2-element points."""
    (ax, ay), (bx, by) = point1, point2
    delta_x = ax - bx
    delta_y = ay - by
    return np.sqrt(delta_x ** 2 + delta_y ** 2)
def getNearestText(point_list, p, max_distance=44):
    """Pair each column point with its nearest text point, if close enough.

    Args:
        point_list: Candidate text points.
        p: Column points to match.
        max_distance: Exclusive upper bound on the distance for a match to
            be kept. Defaults to the previously hard-coded 44.

    Returns:
        ``(nearbyy, selected_clm_point)``: two lists of equal length, the
        matched text points and the column points they were matched to.
        Both are empty when ``point_list`` is empty (the original raised
        ``ValueError`` from ``min()`` in that case).
    """
    nearbyy = []
    selected_clm_point = []  # column points kept for annotation
    if len(point_list) == 0:
        # Nothing to match against, e.g. a PDF page without any text.
        return nearbyy, selected_clm_point
    for column_point in p:
        # Compute each distance once; min() keeps the first of equal
        # minima, matching the tie-breaking of the original key-based min.
        nearest_dist, nearest_point = min(
            ((distance(candidate, column_point), candidate)
             for candidate in point_list),
            key=lambda pair: pair[0],
        )
        if nearest_dist < max_distance:
            nearbyy.append(nearest_point)
            selected_clm_point.append(column_point)
    return nearbyy, selected_clm_point
def getColumnsTypes(nearbyy, x):
    """Collect the "C"-prefixed texts whose coordinates match a nearby point.

    For every point in ``nearbyy`` (outer order) and every tuple in ``x``
    (inner order), the tuple's element 4 is collected when its elements 2
    and 3 equal the point's elements 0 and 1 and element 4 starts with
    ``"C"``.

    Returns:
        The list of matching texts.
    """
    return [
        entry[4]
        for point in nearbyy
        for entry in x
        if entry[2] == point[0]
        and entry[3] == point[1]
        and entry[4].startswith("C")
    ]
def generate_legend(found_tuple):
    """Build a frequency table of the given words.

    Args:
        found_tuple: Iterable of hashable words.

    Returns:
        A DataFrame with columns ``'Column Type'`` and ``'Count'``, one row
        per distinct word in order of first appearance.
    """
    counts = {}
    for word in found_tuple:
        counts[word] = counts.get(word, 0) + 1
    return pd.DataFrame(counts.items(), columns=['Column Type', 'Count'])
def add_annotations_to_pdf(image, pdf_name, slctd_clm, columns_types_v):
    """Add one text (sticky-note) annotation per selected column to page 0.

    Args:
        image: Unused; kept so existing callers keep working.
        pdf_name: Handed to ``fitz.open`` as its second positional argument
            (the in-memory stream), with ``'pdf'`` as the first.
        slctd_clm: Sequence of ``(x, y)`` column points.
        columns_types_v: Annotation text for each point, indexed in step
            with ``slctd_clm``.

    Returns:
        The open ``fitz`` document carrying the new annotations. It is not
        closed here; the caller owns it.
    """
    pdf_document = fitz.open('pdf', pdf_name)
    page = pdf_document[0]
    # Both values are captured before the rotation is reset below.
    rotationOld = page.rotation
    derotationMatrix = page.derotation_matrix
    if rotationOld != 0:
        page.set_rotation(0)
    for index, (x, y) in enumerate(slctd_clm):
        anchor = fitz.Point(x, y) * derotationMatrix
        annot = page.add_text_annot(
            (anchor.x, anchor.y), columns_types_v[index]
        )
        annot.set_border(width=0.2, dashes=(1, 2))  # thin dashed border
        annot.set_colors(stroke=(1, 0, 0), fill=None)  # red stroke, no fill
        annot.update()
    # Put the page back the way it was found.
    page.set_rotation(rotationOld)
    return pdf_document
def mainfun(pdf_name, pdfpath, planname):
    """Detect columns on page 0 of a plan, label them and export a legend.

    Steps:

    1. Read page 0's rotation and derotation matrix.
    2. Extract the page's words and their midpoints; for a 90 degree
       rotation the midpoints are passed through the derotation helpers.
    3. Render the page and segment it with ``segment_brown``. If that
       yields 10 or fewer column candidates, fall back to the gray/blue
       segmentation.
    4. Match each column to its nearest text, build the legend and add the
       annotations to the PDF.
    5. Push the legend through ``google_sheet_Legend.legendGoogleSheets``
       and collect the document's annotations into a DataFrame.

    Args:
        pdf_name: Handed to ``fitz.open`` as the in-memory stream and to the
            helper functions. NOTE(review): presumably the PDF's raw bytes -
            confirm with the caller.
        pdfpath: Forwarded unchanged to ``legendGoogleSheets``.
        planname: Forwarded unchanged to ``legendGoogleSheets``.

    Returns:
        ``(annotatedimg, pdf_document, spreadsheet_url, list1, legend)``:
        the rendered annotated page as a BGR array, the open annotated
        document, the spreadsheet URL, the annotation table and the legend
        DataFrame.
    """
    # Only rotation metadata is needed from this handle, so it is closed
    # right away instead of being left open until garbage collection.
    with fitz.open('pdf', pdf_name) as probe_document:
        first_page = probe_document[0]
        rotation = first_page.rotation
        derotationMatrix = first_page.derotation_matrix

    texts_from_pdf = get_text_from_pdf(pdf_name)
    text_points, txtpts_ky_vlu = getTextsPoints(texts_from_pdf)
    # Only a 90 degree rotation is corrected; other angles pass through.
    if rotation == 90:
        text_points = fix_rotation_90(text_points, derotationMatrix)
        txtpts_ky_vlu = fix_90_ky_val(txtpts_ky_vlu, derotationMatrix)

    img = convert2img(pdf_name)
    imgResult = segment_brown(img)
    outsu = threshold(imgResult)
    column_points, mask_clmns, mask_walls = get_columns_info(outsu, img)
    if len(column_points) <= 10:
        # BLUE COLUMNS: too few brown candidates, retry on the blue range.
        img_blue = changeGrayModify(img)
        imgResult = segment_blue(img_blue)
        outsu = threshold(imgResult)
        column_points, mask_clmns, mask_walls = get_columns_info(outsu, img)

    # Shared by both segmentation paths (previously duplicated per branch).
    nearby, slctd_clm = getNearestText(text_points, column_points)
    columns_types_v = getColumnsTypesKeyValue(nearby, txtpts_ky_vlu)
    legend = generate_legend(columns_types_v)

    pdf_document = add_annotations_to_pdf(
        img, pdf_name, slctd_clm, columns_types_v
    )
    page = pdf_document[0]
    pix = page.get_pixmap()  # render page to an image
    pl = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
    img = np.array(pl)
    annotatedimg = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    legend = legend.fillna(' ')
    gc, spreadsheet_service, spreadsheetId, spreadsheet_url, namepathArr = (
        google_sheet_Legend.legendGoogleSheets(legend, planname, pdfpath)
    )

    list1 = pd.DataFrame(columns=['content', 'id', 'subject', 'color'])
    for page in pdf_document:
        for annot in page.annots():
            annot_color = annot.colors
            if annot_color is not None:
                stroke_color = annot_color.get('stroke')  # Border color
                print('strokeee', stroke_color)
                # NOTE(review): the row is recorded with a fixed red colour
                # whether or not a stroke colour is present; the nesting was
                # reconstructed from flattened source - confirm.
                list1.loc[len(list1)] = [
                    annot.info['content'],
                    annot.info['id'],
                    annot.info['subject'],
                    [255, 0, 0],
                ]
    print('list1', list1)
    return annotatedimg, pdf_document, spreadsheet_url, list1, legend
'''def mainfun(plan):
texts_from_pdf = get_text_from_pdf(plan)
img = convert2img(plan)
imgResult = segment_brown(img)
outsu = threshold(imgResult)
column_points,mask_clmns, mask_walls = get_columns_info(outsu, img)
if len(column_points) > 10:
# BROWN COLUMNS
text_points = getTextsPoints(texts_from_pdf)
nearby = getNearestText(text_points, column_points)
if rotation != 0:
if rotation ==90:
nearby = fix_rotation_90(pc_coordinates)
columns_types = getColumnsTypes(nearby, texts_from_pdf)
legend = generate_legend(columns_types)
else:
# BLUE COLUMNS
img_blue = changeGrayModify(img)
imgResult = segment_blue(img_blue)
outsu = threshold(imgResult)
column_points,mask_clmns, mask_walls = get_columns_info(outsu, img)
text_points = getTextsPoints(texts_from_pdf)
nearby = getNearestText(text_points, column_points)
if rotation != 0:
if rotation ==90:
nearby = fix_rotation_90(pc_coordinates)
columns_types = getColumnsTypes(nearby, texts_from_pdf)
legend = generate_legend(columns_types)
return legend''' |