|
|
import cv2 |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import statistics |
|
|
from statistics import mode |
|
|
from PIL import Image |
|
|
import io |
|
|
import google_sheet_Legend |
|
|
import pypdfium2 as pdfium |
|
|
import fitz |
|
|
import os |
|
|
import random |
|
|
|
|
|
def get_text_from_pdf(input_pdf_path):
    """Extract word tuples from a PDF.

    Args:
        input_pdf_path: Passed to ``fitz.open`` as the second positional
            argument with ``'pdf'`` as the first, so this is presumably the
            raw PDF content rather than a filesystem path; confirm against
            the caller.

    Returns:
        The value of ``page.get_text("words")`` for the last page of the
        document, or an empty list when the document has no pages.
    """
    # Start with an empty result so a page-less document does not hit an
    # unbound local at the return statement.
    text_instances = []
    # The context manager closes the document once the words are extracted.
    with fitz.open('pdf', input_pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            # Each iteration overwrites the previous result, so only the
            # last page's words survive the loop.
            text_instances = page.get_text("words")
            # Kept from the original; it runs after extraction, so it does
            # not change the words collected above.
            page.apply_redactions()
    return text_instances
|
|
|
|
|
def convert2img(path):
    """Render the first page of a PDF and return it as a BGR numpy image.

    Args:
        path: Whatever ``pdfium.PdfDocument`` accepts as its input.

    Returns:
        The rendered page (index 0) as a numpy array after an RGB -> BGR
        channel swap.
    """
    document = pdfium.PdfDocument(path)
    first_page = document.get_page(0)
    # Render -> PIL image -> numpy array (RGB channel order at this point).
    rgb_pixels = np.array(first_page.render().to_pil())
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
|
|
|
|
|
def changeWhiteColumns(img):
    """Return a copy of ``img`` with near-white pixels repainted.

    Pixels whose HSV value lies in ``[0, 0, 250] .. [0, 0, 255]`` are set to
    ``(255, 0, 0)`` (blue, given the BGR layout the colour conversion
    implies). The input array is left untouched.

    Args:
        img: Image array accepted by ``cv2.cvtColor(..., COLOR_BGR2HSV)``.

    Returns:
        The repainted copy.
    """
    painted = img.copy()
    hsv_view = cv2.cvtColor(painted, cv2.COLOR_BGR2HSV)
    lower_white = np.array([0, 0, 250])
    upper_white = np.array([0, 0, 255])
    white_mask = cv2.inRange(hsv_view, lower_white, upper_white)
    painted[white_mask > 0] = (255, 0, 0)
    return painted
|
|
|
|
|
def changeGrayModify(img):
    """Return a copy of ``img`` with a band of gray pixels repainted.

    Pixels whose HSV value lies in ``[0, 0, 175] .. [0, 0, 199]`` are set to
    ``(255, 0, 0)`` (blue, given the BGR layout the colour conversion
    implies).

    The work is done on a copy so the caller's array is not modified, which
    matches ``changeWhiteColumns`` in this module.

    Args:
        img: Image array accepted by ``cv2.cvtColor(..., COLOR_BGR2HSV)``.

    Returns:
        The repainted copy.
    """
    # Copy first: painting into ``img`` directly would silently alter the
    # array the caller keeps using afterwards.
    painted = img.copy()
    hsv = cv2.cvtColor(painted, cv2.COLOR_BGR2HSV)

    gray_range_low = np.array([0, 0, 175])
    gray_range_high = np.array([0, 0, 199])

    mask = cv2.inRange(hsv, gray_range_low, gray_range_high)
    painted[mask > 0] = (255, 0, 0)
    return painted
|
|
|
|
|
def segment_blue(gray_changed):
    """Keep only the pixels inside the module's "blue" HSV window.

    The window is ``[120, 255, 255] .. [179, 255, 255]``; the image is
    AND-ed with itself under that mask.

    Args:
        gray_changed: Image array accepted by
            ``cv2.cvtColor(..., COLOR_BGR2HSV)``.

    Returns:
        The masked image produced by ``cv2.bitwise_and``.
    """
    hsv_view = cv2.cvtColor(gray_changed, cv2.COLOR_BGR2HSV)
    lower_bound = np.array([120, 255, 255])
    upper_bound = np.array([179, 255, 255])
    selection = cv2.inRange(hsv_view, lower_bound, upper_bound)
    return cv2.bitwise_and(gray_changed, gray_changed, mask=selection)
|
|
|
|
|
def segment_brown(img):
    """Keep only the pixels inside the module's "brown" HSV window.

    The window is ``[0, 9, 0] .. [81, 255, 255]``; the image is AND-ed with
    itself under that mask.

    Args:
        img: Image array accepted by ``cv2.cvtColor(..., COLOR_BGR2HSV)``.

    Returns:
        The masked image produced by ``cv2.bitwise_and``.
    """
    hsv_view = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    lower_bound = np.array([0, 9, 0])
    upper_bound = np.array([81, 255, 255])
    selection = cv2.inRange(hsv_view, lower_bound, upper_bound)
    return cv2.bitwise_and(img, img, mask=selection)
|
|
|
|
|
def threshold(imgResult3):
    """Blur, convert to grayscale and binarise an image with Otsu's method.

    Args:
        imgResult3: Image array accepted by ``cv2.GaussianBlur`` and
            ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.

    Returns:
        The thresholded image (second element of ``cv2.threshold``'s result).
    """
    blurred = cv2.GaussianBlur(imgResult3, (3, 3), 9)
    grayscale = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    # The threshold argument 0 is a placeholder: THRESH_OTSU is requested.
    _, binary = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return binary
|
|
|
|
|
def get_columns_info(outsu4, img):
    """Classify external contours by area into "walls" and "columns".

    Args:
        outsu4: Binary image passed to ``cv2.findContours``.
        img: Image whose first two shape dimensions size the output masks.

    Returns:
        A tuple ``(p, mask_clmns, mask_walls)``:
        * ``p`` - centroids ``(x, y)`` of contours with 90 < area < 1762;
        * ``mask_clmns`` - uint8 mask, 255 everywhere except those contours,
          which are filled with 0;
        * ``mask_walls`` - uint8 mask, 255 everywhere except contours with
          area > 1762, which are filled with 0.
    """
    mask_shape = img.shape[:2]
    mask_clmns = np.full(mask_shape, 255, dtype="uint8")
    mask_walls = np.full(mask_shape, 255, dtype="uint8")
    contours, _ = cv2.findContours(
        image=outsu4, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_NONE
    )

    wall_area = 881.0 * 2
    p = []
    for contour in contours:
        moments = cv2.moments(contour)
        if moments['m00'] != 0.0:
            # Centroid from the first-order moments.
            cx = int(moments['m10'] / moments['m00'])
            cy = int(moments['m01'] / moments['m00'])

        area = cv2.contourArea(contour)
        if area > wall_area:
            cv2.drawContours(mask_walls, [contour], -1, 0, -1)
        elif 90 < area < wall_area:
            # NOTE(review): relies on m00 being non-zero whenever the area
            # exceeds 90, so cx/cy belong to this contour - confirm.
            p.append((cx, cy))
            cv2.drawContours(mask_clmns, [contour], -1, 0, -1)
    return p, mask_clmns, mask_walls
|
|
|
|
|
def getTextsPoints(x):
    """Compute a midpoint for every word tuple and map it to the word.

    Args:
        x: Iterable of indexable entries; indices 0-3 are coordinates and
            index 4 is the associated text.

    Returns:
        ``(point_list, pt_clm)`` where ``point_list`` holds one midpoint per
        entry (in input order) and ``pt_clm`` maps each midpoint to the
        entry's index-4 value (later entries win on identical midpoints).
    """
    point_list = []
    pt_clm = {}
    for entry in x:
        # Arguments are passed as (1, 0, 3, 2), i.e. each coordinate pair is
        # swapped before the midpoint is taken.
        midpoint = calculate_midpoint(entry[1], entry[0], entry[3], entry[2])
        point_list.append(midpoint)
        pt_clm[midpoint] = entry[4]
    return point_list, pt_clm
|
|
|
|
|
def fix_90_ky_val(pt_clm, derotationMatrix):
    """Re-key a point -> text mapping through a derotation matrix.

    Args:
        pt_clm: Mapping whose keys are indexable 2-element points.
        derotationMatrix: Matrix multiplied onto each ``fitz.Point``.

    Returns:
        A new dict keyed by ``(int(y), int(x))`` of each transformed point,
        carrying over the original values.
    """
    remapped = {}
    for key, label in pt_clm.items():
        moved = fitz.Point(key[0], key[1]) * derotationMatrix
        # The transformed coordinates are stored in swapped (y, x) order.
        remapped[(int(moved.y), int(moved.x))] = label
    return remapped
|
|
|
|
|
def calculate_midpoint(x1, y1, x2, y2):
    """Return the integer midpoint between ``(x1, y1)`` and ``(x2, y2)``.

    Each coordinate is averaged with true division and then truncated
    toward zero by ``int``.
    """
    def halfway(a, b):
        return int((a + b) / 2)

    return (halfway(x1, x2), halfway(y1, y2))
|
|
|
|
|
def getColumnsTypesKeyValue(nearbyy, pt_clm):
    """Look up the text stored for each point in ``nearbyy``.

    Args:
        nearbyy: Sequence of points used as keys.
        pt_clm: Mapping from point to text.

    Returns:
        List of ``pt_clm`` values, one per point, in input order.

    Raises:
        KeyError: If a point is missing from ``pt_clm``.
    """
    return [pt_clm[point] for point in nearbyy]
|
|
|
|
|
def fix_rotation_90(pc_coordinates, derotationMatrix):
    """Transform a list of points through a derotation matrix.

    Args:
        pc_coordinates: Iterable of indexable 2-element points.
        derotationMatrix: Matrix multiplied onto each ``fitz.Point``.

    Returns:
        List of ``(int(y), int(x))`` tuples of the transformed points, in
        input order.
    """
    transformed = []
    for coordinate in pc_coordinates:
        moved = fitz.Point(coordinate[0], coordinate[1]) * derotationMatrix
        # The transformed coordinates are stored in swapped (y, x) order.
        transformed.append((int(moved.y), int(moved.x)))
    return transformed
|
|
|
|
|
def distance(point1, point2):
    """Return the Euclidean distance between two 2-element points."""
    (ax, ay), (bx, by) = point1, point2
    dx = ax - bx
    dy = ay - by
    return np.sqrt(dx ** 2 + dy ** 2)
|
|
|
|
|
def getNearestText(point_list, p, max_distance=44):
    """Pair each column point with its nearest text point, if close enough.

    Args:
        point_list: Candidate text points.
        p: Column points to match.
        max_distance: A match is kept only when the distance to the nearest
            text point is strictly below this value (default 44, the
            previously hard-coded threshold).

    Returns:
        ``(nearbyy, selected_clm_point, txt_clmn)``:
        * ``nearbyy`` - the nearest text point of every kept match;
        * ``selected_clm_point`` - the matching column points;
        * ``txt_clmn`` - ``(text_point, column_point)`` pairs.
        All three are empty when ``point_list`` is empty.
    """
    nearbyy = []
    selected_clm_point = []
    txt_clmn = []

    # With no text points there is nothing to match; ``min`` would raise
    # ValueError on the empty sequence.
    if len(point_list) == 0:
        return nearbyy, selected_clm_point, txt_clmn

    for column_point in p:
        nearest_point = min(
            point_list, key=lambda point: distance(point, column_point)
        )
        if distance(nearest_point, column_point) < max_distance:
            nearbyy.append(nearest_point)
            selected_clm_point.append(column_point)
            txt_clmn.append((nearest_point, column_point))
    return nearbyy, selected_clm_point, txt_clmn
|
|
|
|
|
|
|
|
def getColumnsTypes(nearbyy, x):
    """Collect "C..." labels of entries whose indices 2/3 match a point.

    Args:
        nearbyy: Sequence of indexable 2-element points.
        x: Iterable of indexable entries; index 4 must be a string.

    Returns:
        For every point (outer order) and every entry (inner order), the
        entry's index-4 string when ``entry[2] == point[0]``,
        ``entry[3] == point[1]`` and the string starts with ``"C"``.
    """
    return [
        entry[4]
        for anchor in nearbyy
        for entry in x
        if entry[2] == anchor[0]
        and entry[3] == anchor[1]
        and entry[4].startswith("C")
    ]
|
|
|
|
|
def generate_legend(found_tuple):
    """Build a frequency table of the given labels.

    Args:
        found_tuple: Iterable of hashable labels.

    Returns:
        A DataFrame with columns ``'Column Type'`` and ``'Count'``, one row
        per distinct label in order of first appearance.
    """
    counts = {}
    for label in found_tuple:
        counts[label] = counts.get(label, 0) + 1
    return pd.DataFrame(counts.items(), columns=['Column Type', 'Count'])
|
|
|
|
|
def color_groups(txtpts_ky_vlu):
    """Assign a random RGB colour to every distinct value of a mapping.

    Args:
        txtpts_ky_vlu: Mapping whose values are hashable labels.

    Returns:
        Dict from each distinct label to a tuple of three random integers
        in ``0..255``.
    """
    distinct_labels = list(set(txtpts_ky_vlu.values()))
    key_colors = {}
    for label in distinct_labels:
        # Three independent draws, one per channel.
        key_colors[label] = tuple(random.randint(0, 255) for _ in range(3))
    return key_colors
|
|
|
|
|
def get_drawing_info(txt_clmn, txtpts_ky_vlu, key_colors):
    """Attach the label and its colour to every (text, column) pair.

    Args:
        txt_clmn: Iterable of ``(text_location, column_location)`` pairs.
        txtpts_ky_vlu: Mapping from text location to label.
        key_colors: Mapping from label to colour.

    Returns:
        List of ``(text_location, column_location, label, colour)`` tuples
        in input order.

    Raises:
        KeyError: If a text location or label is missing from its mapping.
    """
    return [
        (
            text_location,
            column_location,
            txtpts_ky_vlu[text_location],
            key_colors[txtpts_ky_vlu[text_location]],
        )
        for text_location, column_location in txt_clmn
    ]
|
|
'''def add_annotations_to_pdf(image, pdf_name, slctd_clm, columns_types_v): |
|
|
image_width = image.shape[1] |
|
|
image_height = image.shape[0] |
|
|
# Create a new PDF document |
|
|
pdf_document = fitz.open('pdf',pdf_name) |
|
|
page=pdf_document[0] |
|
|
rotationOld=page.rotation |
|
|
derotationMatrix=page.derotation_matrix |
|
|
if page.rotation!=0: |
|
|
rotationangle = page.rotation |
|
|
page.set_rotation(0) |
|
|
for i in range(len(slctd_clm)): |
|
|
x, y = slctd_clm[i] |
|
|
p_midpoint = fitz.Point(x, y) * derotationMatrix |
|
|
text = columns_types_v[i] |
|
|
# Create an annotation (sticky note) |
|
|
annot = page.add_text_annot((p_midpoint.x, p_midpoint.y), text) |
|
|
annot.set_border(width=0.2, dashes=(1, 2)) # Optional border styling |
|
|
annot.set_colors(stroke=(1, 0, 0), fill=None) # Set the stroke color to red |
|
|
annot.update() |
|
|
page.set_rotation(rotationOld) |
|
|
return pdf_document''' |
|
|
|
|
|
def add_annotations_to_pdf(image, pdf_name, huge_list_clmn_clr_loc):
    """Draw one circle annotation per detected column on the first page.

    Args:
        image: Unused; kept so existing callers keep working.
        pdf_name: Passed to ``fitz.open`` as the second positional argument
            with ``'pdf'`` as the first (presumably the raw PDF content;
            confirm against the caller).
        huge_list_clmn_clr_loc: Iterable of
            ``(text_location, column_location, word, colour)`` tuples, where
            ``colour`` holds three 0-255 channel values.

    Returns:
        The opened ``fitz`` document with the annotations added. It is left
        open; the caller owns it.
    """
    pdf_document = fitz.open('pdf', pdf_name)
    page = pdf_document[0]
    rotationOld = page.rotation
    derotationMatrix = page.derotation_matrix
    # Annotations are placed with the rotation removed; it is restored once
    # all of them have been added.
    if rotationOld != 0:
        page.set_rotation(0)

    for _text_loc, column_loc, word, clr in huge_list_clmn_clr_loc:
        x, y = column_loc
        # Scale the 0-255 channels to the 0-1 floats set_colors expects.
        stroke = (clr[0] / 255, clr[1] / 255, clr[2] / 255)

        p_midpoint = fitz.Point(x, y) * derotationMatrix
        # 20x20 circle centred on the derotated column point.
        annot = page.add_circle_annot(
            fitz.Rect(
                p_midpoint.x - 10,
                p_midpoint.y - 10,
                p_midpoint.x + 10,
                p_midpoint.y + 10,
            )
        )

        annot.set_colors(stroke=stroke, fill=(1, 1, 1))
        annot.set_border(width=2)
        annot.set_opacity(1)

        # set_info's first positional parameter is an info dict, so the
        # earlier ``set_info("subject", "Count")`` style calls only ever
        # overwrote the content. Keywords set each field as intended.
        annot.set_info(content=word, subject="Count", title=word)
        annot.update()

    page.set_rotation(rotationOld)
    return pdf_document
|
|
|
|
|
def mainfun(pdf_name, pdfpath, planname):
    """Detect columns on a plan PDF, annotate them and publish a legend.

    Steps, all on the first page: read the words, locate column blobs by
    colour segmentation (brown first, blue when at most 10 brown blobs are
    found), pair each blob with its nearest word, add circle annotations,
    render the annotated page and send the legend to Google Sheets.

    Args:
        pdf_name: Passed to ``fitz.open('pdf', ...)`` and to the helper
            functions (presumably the raw PDF content; confirm against the
            caller).
        pdfpath: Forwarded unchanged to
            ``google_sheet_Legend.legendGoogleSheets``.
        planname: Forwarded unchanged to
            ``google_sheet_Legend.legendGoogleSheets``.

    Returns:
        ``(annotatedimg, pdf_document, spreadsheet_url, list1, legend)``:
        the rendered annotated page as a BGR array, the annotated (still
        open) document, the spreadsheet URL, a DataFrame listing the
        annotations, and the legend DataFrame.
    """
    # This document is only needed for the rotation data; close it right
    # away instead of leaking it when the name is rebound further down.
    with fitz.open('pdf', pdf_name) as probe_document:
        first_page = probe_document[0]
        rotation = first_page.rotation
        derotationMatrix = first_page.derotation_matrix

    texts_from_pdf = get_text_from_pdf(pdf_name)
    text_points, txtpts_ky_vlu = getTextsPoints(texts_from_pdf)
    # Only a rotation of exactly 90 is compensated; other rotations are
    # left as they are.
    if rotation == 90:
        text_points = fix_rotation_90(text_points, derotationMatrix)
        txtpts_ky_vlu = fix_90_ky_val(txtpts_ky_vlu, derotationMatrix)

    img = convert2img(pdf_name)
    imgResult = segment_brown(img)
    outsu = threshold(imgResult)
    column_points, _mask_clmns, _mask_walls = get_columns_info(outsu, img)
    key_colors = color_groups(txtpts_ky_vlu)

    if len(column_points) <= 10:
        # Too few brown blobs: repaint the gray band and retry with blue.
        img_blue = changeGrayModify(img)
        imgResult = segment_blue(img_blue)
        outsu = threshold(imgResult)
        column_points, _mask_clmns, _mask_walls = get_columns_info(outsu, img)

    # Identical for both colour paths, so it is done once here.
    nearby, _slctd_clm, txt_clmn = getNearestText(text_points, column_points)
    columns_types_v = getColumnsTypesKeyValue(nearby, txtpts_ky_vlu)
    legend = generate_legend(columns_types_v)
    huge_list_clmn_clr_loc = get_drawing_info(txt_clmn, txtpts_ky_vlu, key_colors)

    pdf_document = add_annotations_to_pdf(img, pdf_name, huge_list_clmn_clr_loc)
    annotated_page = pdf_document[0]
    pix = annotated_page.get_pixmap()
    pl = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
    annotatedimg = cv2.cvtColor(np.array(pl), cv2.COLOR_RGB2BGR)

    legend = legend.fillna(' ')
    # Only the spreadsheet URL of the five returned values is used.
    _gc, _service, _sheet_id, spreadsheet_url, _namepath = (
        google_sheet_Legend.legendGoogleSheets(legend, planname, pdfpath)
    )

    list1 = pd.DataFrame(columns=['content', 'id', 'subject', 'color'])
    for doc_page in pdf_document:
        for annot in doc_page.annots():
            annot_color = annot.colors
            if annot_color is None:
                continue
            stroke_color = annot_color.get('stroke')
            print('strokeee', stroke_color)
            # NOTE(review): the row is recorded only for annotations that
            # have a stroke colour - confirm this nesting is the intended
            # one. The stored colour is always [255, 0, 0], not the
            # annotation's own stroke.
            if stroke_color:
                list1.loc[len(list1)] = [
                    annot.info['content'],
                    annot.info['id'],
                    annot.info['subject'],
                    [255, 0, 0],
                ]

    print('list1', list1)
    return annotatedimg, pdf_document, spreadsheet_url, list1, legend
|
|
|
|
|
'''def mainfun(plan): |
|
|
texts_from_pdf = get_text_from_pdf(plan) |
|
|
img = convert2img(plan) |
|
|
imgResult = segment_brown(img) |
|
|
outsu = threshold(imgResult) |
|
|
column_points,mask_clmns, mask_walls = get_columns_info(outsu, img) |
|
|
if len(column_points) > 10: |
|
|
# BROWN COLUMNS |
|
|
text_points = getTextsPoints(texts_from_pdf) |
|
|
nearby = getNearestText(text_points, column_points) |
|
|
if rotation != 0: |
|
|
if rotation ==90: |
|
|
nearby = fix_rotation_90(pc_coordinates) |
|
|
columns_types = getColumnsTypes(nearby, texts_from_pdf) |
|
|
legend = generate_legend(columns_types) |
|
|
else: |
|
|
# BLUE COLUMNS |
|
|
img_blue = changeGrayModify(img) |
|
|
imgResult = segment_blue(img_blue) |
|
|
outsu = threshold(imgResult) |
|
|
column_points,mask_clmns, mask_walls = get_columns_info(outsu, img) |
|
|
text_points = getTextsPoints(texts_from_pdf) |
|
|
nearby = getNearestText(text_points, column_points) |
|
|
if rotation != 0: |
|
|
if rotation ==90: |
|
|
nearby = fix_rotation_90(pc_coordinates) |
|
|
columns_types = getColumnsTypes(nearby, texts_from_pdf) |
|
|
legend = generate_legend(columns_types) |
|
|
return legend''' |
|
|
|
|
|
|