Marthee committed on
Commit
bf5d8d3
·
verified ·
1 Parent(s): 49c1df5

Upload Doors_Schedule.py

Browse files
Files changed (1) hide show
  1. Doors_Schedule.py +472 -0
Doors_Schedule.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import pandas as pd
3
+ import random
4
+ import re
5
+ import io
6
+ import pypdfium2 as pdfium
7
+ import fitz
8
+ from PIL import Image, ImageDraw
9
+ from PyPDF2 import PdfReader, PdfWriter
10
+ from PyPDF2.generic import TextStringObject, NameObject, ArrayObject, FloatObject
11
+ from PyPDF2.generic import NameObject, TextStringObject, DictionaryObject, FloatObject, ArrayObject
12
+ from PyPDF2 import PdfReader
13
+ from PyPDF2.generic import TextStringObject
14
+ import numpy as np
15
+ import cv2
16
+
17
+
18
def convert2img(path):
    """Render page 0 of a PDF with pypdfium2 and return it as a BGR array.

    ``path`` is passed unchanged to ``pdfium.PdfDocument``. The rendered PIL
    image is turned into a NumPy array and its channel order is swapped from
    RGB to BGR with OpenCV before being returned.
    """
    document = pdfium.PdfDocument(path)
    first_page = document.get_page(0)
    rgb_pixels = np.array(first_page.render().to_pil())
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
25
+
26
def convert2pillow(path):
    """Render page 0 of a PDF with pypdfium2 and return the PIL image.

    ``path`` is passed unchanged to ``pdfium.PdfDocument``.
    """
    document = pdfium.PdfDocument(path)
    first_page = document.get_page(0)
    return first_page.render().to_pil()
31
+
32
def calculate_midpoint(x1,y1,x2,y2):
    """Return the midpoint of (x1, y1)-(x2, y2) as an ``(x, y)`` tuple of ints.

    Each average goes through ``int()``, so it is truncated toward zero
    rather than rounded.
    """
    return int((x1 + x2) / 2), int((y1 + y2) / 2)
36
+
37
def read_text(input_pdf_path):
    """Return the word tuples of the last page of an in-memory PDF.

    Args:
        input_pdf_path: Value handed to ``fitz.open`` as its second (stream)
            argument, so despite the name it is expected to be PDF content
            rather than a filesystem path.

    Returns:
        The result of ``page.get_text("words")`` for the final page of the
        document, or an empty list when the document has no pages.
    """
    # Defined before the loop so a page-less document cannot raise
    # UnboundLocalError at the return statement.
    text_instances = []
    # The context manager closes the document even if extraction fails.
    with fitz.open('pdf', input_pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            # Each iteration overwrites the previous result, so only the
            # last page's words are returned.
            text_instances = page.get_text("words")

            # Runs after extraction, so it does not alter the words above.
            page.apply_redactions()
    return text_instances
46
+
47
def search_columns(df):
    """Locate the door-schedule header fields of a table.

    The column names are searched first for door id, door type, width and
    height. Fields still missing are then looked for in the cells of the
    first two rows. When width or height is still missing, a
    "structural opening" header is searched for, in the column names first
    and then in the first two rows.

    Args:
        df: Table to inspect (a pandas DataFrame).

    Returns:
        Dict keyed by ``"door_id"``, ``"door_type"``, ``"width"``,
        ``"height"`` and/or ``"structural opening"``. A value is a list of
        column positions (``int``) when the header was found in the column
        names, or a list of ``(row, column)`` tuples when it was found in a
        cell. Fields that were not found are absent.
    """
    # Define patterns
    door_id_pattern = r'\b(?:door\s*)?(?:id|no|number)(?!-)\b'
    door_type_pattern = r'^\s*(?:\S*\s+)?door\s*[\n\s]*type\s*$|^type\s*$'
    width_pattern = r'^\s*(?:WIDTH|Width|width)\s*$'
    height_pattern = r'^\s*(?:HEIGHT|Height|height)\s*$'
    structural_opening_pattern = r'\b(?:Structural\s+opening|structural\s+opening)\b'

    def find_column_indices(df, patterns):
        """Map each key to the positions of the column names matching its pattern."""
        matches = {}
        for key, pattern in patterns.items():
            indices = [
                i
                for i, col in enumerate(df.columns)
                # Non-string labels (e.g. default integer headers) cannot be
                # matched and would make re.search raise; skip them, as the
                # cell search below does for non-string cells.
                if isinstance(col, str) and re.search(pattern, col, re.IGNORECASE)
            ]
            if indices:
                matches[key] = indices
        return matches

    def find_matches_in_cells(df, patterns):
        """Map each key to the (row, column) positions matching in the first two rows."""
        matches = {}
        for key, pattern in patterns.items():
            found = []
            for row_idx in range(min(2, len(df))):  # Limit to the first two rows
                for col_idx in range(len(df.columns)):
                    cell = df.iat[row_idx, col_idx]
                    if isinstance(cell, str) and re.search(pattern, cell, re.IGNORECASE):
                        found.append((row_idx, col_idx))
            if found:
                matches[key] = found
        return matches

    # Search in column names first
    patterns = {
        "door_id": door_id_pattern,
        "door_type": door_type_pattern,
        "width": width_pattern,
        "height": height_pattern,
    }
    column_matches = find_column_indices(df, patterns)

    # If door_id and door_type are NOT found in column names, search in cells
    if "door_id" not in column_matches and "door_type" not in column_matches:
        cell_matches = find_matches_in_cells(
            df, {"door_id": door_id_pattern, "door_type": door_type_pattern}
        )
        column_matches.update(cell_matches)

    # If width and height are NOT found in column names, search for them in cells
    if "width" not in column_matches and "height" not in column_matches:
        cell_matches = find_matches_in_cells(
            df, {"width": width_pattern, "height": height_pattern}
        )
        column_matches.update(cell_matches)

    # If width or height is still missing, fall back to a structural opening header
    if "width" not in column_matches or "height" not in column_matches:
        structural_opening_match = find_column_indices(
            df, {"structural opening": structural_opening_pattern}
        )
        column_matches.update(structural_opening_match)

        # If structural opening is also NOT found in column names, search in cells
        if "structural opening" not in column_matches:
            structural_opening_match = find_matches_in_cells(
                df, {"structural opening": structural_opening_pattern}
            )
            column_matches.update(structural_opening_match)

    return column_matches
114
+
115
def row_clmn_indices(column_matches):
    """Reduce each field's match list to its first hit.

    Args:
        column_matches: Dict mapping a field name to a non-empty list whose
            entries are either column positions or ``(row, column)`` tuples.

    Returns:
        ``(clm_idx, starting_row_index)``: a list of ``(field, column)``
        pairs for every field, and the list of row positions taken from the
        fields whose first hit is a tuple.
    """
    column_pairs = []
    header_rows = []
    for field, hits in column_matches.items():
        first_hit = hits[0]
        if type(first_hit) is tuple:
            # Cell hit: (row, column)
            header_rows.append(first_hit[0])
            column_pairs.append((field, first_hit[1]))
            continue
        # Column-name hit: already a column position
        column_pairs.append((field, first_hit))
    return column_pairs, header_rows
125
+
126
+
127
def generate_current_table_without_cropping(clm_idx,df):
    """Return the columns of ``df`` at positions ``clm_idx``, keeping every row.

    A progress message is printed once the selection has been made.
    """
    column_subset = df.iloc[:, clm_idx]
    print("hello I generated the selected columns table without cropping")
    return column_subset
131
+
132
def column_name_index(clm_idx):
    """Split a list of ``(name, index)`` pairs into two parallel lists.

    Returns:
        ``(names, indices)`` in the same order as the input pairs.
    """
    pairs = [(name, position) for name, position in clm_idx]
    names = [name for name, _ in pairs]
    positions = [position for _, position in pairs]
    return names, positions
140
+
141
def crop_rename_table(indices, clmn_name, clmn_idx,df):
    """Drop the header rows of ``df`` and return the chosen columns, renamed.

    Args:
        indices: Row positions at which header cells were found. Every row
            up to and including the largest one is discarded.
        clmn_name: New names for the selected columns, in order.
        clmn_idx: Positional indices of the columns to keep.
        df: Table to crop; it is left unmodified.

    Returns:
        A new DataFrame holding the remaining rows of the selected columns,
        with the index renumbered from 0 and the columns named ``clmn_name``.

    Raises:
        ValueError: If ``indices`` is empty (raised by ``max``).
    """
    crop_at = max(indices) + 1

    # Build new frames instead of mutating slices in place: reset_index(...,
    # inplace=True) and a column rename on a slice of the caller's frame are
    # what trigger pandas' SettingWithCopy warnings.
    body = df.iloc[crop_at:].reset_index(drop=True)
    slctd_clms = body.iloc[:, clmn_idx].copy()
    slctd_clms.columns = clmn_name

    return slctd_clms
153
+
154
def details_in_another_table(clmn_name, clmn_idx, current_dfs, dfs):
    """Pull the selected columns from a table as wide as ``current_dfs``.

    Every table in ``dfs`` with the same number of columns as
    ``current_dfs`` is a candidate; the last one in the list is used. Its
    own column labels are inserted as the first data row, and the columns
    are then renamed to ``clmn_name``.

    Args:
        clmn_name: Names to assign to the selected columns.
        clmn_idx: Positional indices of the columns to select.
        current_dfs: Table whose column count identifies the candidates.
        dfs: All tables to search.

    Returns:
        A new DataFrame: one row of the former column labels followed by the
        candidate table's rows.
    """
    wanted_width = current_dfs.shape[1]
    for candidate in dfs:
        if candidate.shape[1] == wanted_width:
            # Keep overwriting so that the last same-width table wins.
            df = candidate

    picked = df.iloc[:, clmn_idx].copy()  # copy so the source table is untouched
    # The candidate's labels are presumably its first data row (the header
    # lives in another table), so push them back into the data.
    label_row = pd.DataFrame([picked.columns], columns=picked.columns)
    new_df = pd.concat([label_row, picked], ignore_index=True)
    new_df.columns = clmn_name
    return new_df
168
+
169
def extract_tables(schedule):
    """Return the tables detected on the last page of a PDF as DataFrames.

    Args:
        schedule: Value handed to ``fitz.open`` as its second (stream)
            argument.

    Returns:
        List with one DataFrame per table found by ``page.find_tables()``.
        NOTE: the list is rebuilt for every page, so for a multi-page
        document only the final page's tables are returned.
    """
    doc = fitz.open("pdf", schedule)
    for page in doc:
        dfs = [tab.to_pandas() for tab in page.find_tables()]
    return dfs
178
+
179
def get_selected_columns(dfs):
    """Select the door-schedule columns from every table that has them.

    For each table, ``search_columns`` locates the header fields. Depending
    on where they were found, the data columns are taken from:

    * another table with the same column count, when this table has fewer
      than 10 rows (treated as a header-only table);
    * this table as-is, when the headers are its column names;
    * this table cropped below the header rows, when the headers are in
      its cells.

    Tables where some headers were found in column names and others in
    cells ("MIX") are reported but contribute nothing to the result.

    Args:
        dfs: List of DataFrames extracted from the schedule.

    Returns:
        List of tuples ``(selected_df, source_df, clm_idx, clmn_name,
        starting_row_index)``, one per table that produced a selection.
    """
    selected_columns = []
    for i, table in enumerate(dfs):
        column_matches = search_columns(table)
        clm_idx, starting_row_index = row_clmn_indices(column_matches)
        clmn_name, clmn_idx = column_name_index(clm_idx)

        if len(clm_idx) == 0 and len(starting_row_index) == 0:
            print(f"this is df {i}, SEARCH IN ANOTHER DF")
            continue

        # MIX
        if (len(clm_idx) != len(starting_row_index)) and len(starting_row_index) > 0:
            print(f"this is df {i} MIX, search in another df but make sure of the length")

        # A table of exactly 10 rows used to match neither "< 10" nor "> 10"
        # and was silently dropped; it is now handled as a same-table case.
        details_elsewhere = len(table) < 10

        # IN COLUMNS
        if len(starting_row_index) == 0:
            print(f"this is df {i} mawgooda fel columns, check el df length 3ashan law el details fe table tany")
            if details_elsewhere:
                # details in another table
                selected_columns_new = details_in_another_table(clmn_name, clmn_idx, table, dfs)
            else:
                # details in the same table
                selected_columns_new = generate_current_table_without_cropping(clmn_idx, table)
            selected_columns.append((selected_columns_new, table, clm_idx, clmn_name, starting_row_index))

        # IN CELLS
        if len(starting_row_index) == len(clm_idx):
            print(f"this is df {i} mawgooda fel cells, check el df length 3ashan law el details fe table tany")
            if details_elsewhere:
                # details in another table
                selected_columns_new = details_in_another_table(clmn_name, clmn_idx, table, dfs)
            else:
                # details in the same table
                print(f"this is df {i} call crop_rename_table(indices, clmn_name, clmn_idx,df)")
                selected_columns_new = crop_rename_table(starting_row_index, clmn_name, clmn_idx, table)
            selected_columns.append((selected_columns_new, table, clm_idx, clmn_name, starting_row_index))
    return selected_columns
218
+
219
def get_st_op_pattern(clm_idx, clmn_name, starting_row_index, df):
    """Return the header text of the "structural opening" column.

    Args:
        clm_idx: List of ``(field, column_position)`` pairs.
        clmn_name: Field names, in the same order as ``clm_idx``.
        starting_row_index: Row positions of the header cells, parallel to
            ``clmn_name``; empty when the headers are the column names.
        df: Table the positions refer to.

    Returns:
        The header cell's value when the header was found in a cell, the
        column label when it was found among the column names, or ``None``
        when there is no "structural opening" field.
    """
    target = 'structural opening'
    if target not in clmn_name:
        return None

    structural_opening_value = dict(clm_idx).get(target)
    if structural_opening_value is None:
        return None

    position = clmn_name.index(target)
    if position < len(starting_row_index):
        return df.iloc[starting_row_index[position], structural_opening_value]

    # No header row was recorded for this field, i.e. the header was matched
    # against the column names; indexing starting_row_index would raise
    # IndexError, so the label itself is the header text.
    return df.columns[structural_opening_value]
229
+
230
def get_similar_colors(selected_columns_new):
    """Group door ids by door type and give every type a random RGB colour.

    Args:
        selected_columns_new: DataFrame with ``door_type``, ``door_id`` and
            ``structural opening`` columns.

    Returns:
        Plain dict mapping each door type to
        ``{'values': [door ids], 'color': (r, g, b), 'widths': [structural
        openings]}``, with types in order of first appearance.
    """
    def random_rgb():
        return tuple(random.randint(0, 255) for _ in range(3))  # RGB tuple

    # One colour per distinct door type, drawn in order of first appearance.
    palette = {
        door_type: random_rgb()
        for door_type in selected_columns_new['door_type'].unique()
    }

    grouped = {}
    for _, row in selected_columns_new.iterrows():
        door_type = row['door_type']
        entry = grouped.setdefault(door_type, {'values': [], 'color': None, 'widths': []})
        entry['values'].append(row['door_id'])
        entry['widths'].append(row['structural opening'])
        entry['color'] = palette[door_type]
    return grouped
249
+
250
def get_flattened_tuples_list(col_dict):
    """Flatten the per-type groups into one list of ``(value, width, color)``.

    Values and widths of a group are paired positionally (``zip``), and each
    pair carries its group's colour. Groups are visited in dict order.
    """
    return [
        (value, width, group["color"])
        for group in col_dict.values()
        for value, width in zip(group['values'], group['widths'])
    ]
256
+
257
def find_text_in_plan(label, x):
    """Find every entry of ``x`` whose text equals ``label``.

    Args:
        label: Text to look for (exact match).
        x: Iterable of sequences indexed as ``(x0, y0, x1, y1, text, ...)``
            - presumably PyMuPDF word tuples; confirm against the caller.

    Returns:
        ``(substring_coordinates, words, point_list)``: the midpoints of the
        matching boxes, the matched texts, and the same midpoints computed
        with x and y swapped (for rotated pages).
    """
    hits = [tpl for tpl in x if tpl[4] == label]
    # Midpoints in PDF orientation
    substring_coordinates = [calculate_midpoint(t[0], t[1], t[2], t[3]) for t in hits]
    # Midpoints with the axes swapped, for rotated pages
    point_list = [calculate_midpoint(t[1], t[0], t[3], t[2]) for t in hits]
    words = [t[4] for t in hits]
    return substring_coordinates, words, point_list
268
+
269
def get_word_locations_plan(flattened_list, plan_texts):
    """Look up every schedule label in the plan's text.

    Args:
        flattened_list: Iterable of ``(label, width, color)`` tuples.
        plan_texts: Word entries of the plan, as accepted by
            ``find_text_in_plan``.

    Returns:
        ``(locations, not_found)``: one ``(coordinates, label, color,
        width)`` tuple per input entry (coordinates may be empty), and the
        labels for which no coordinates were found.
    """
    locations, not_found = [], []
    for lbl, w, clr in flattened_list:
        found_at = find_text_in_plan(lbl, plan_texts)[0]
        if not found_at:
            not_found.append(lbl)
        locations.append((found_at, lbl, clr, w))
    return locations, not_found
278
+
279
def get_repeated_labels(locations):
    """Return the set of labels (item index 1) that occur more than once."""
    occurrences = defaultdict(int)
    for item in locations:
        occurrences[item[1]] += 1
    return {label for label, count in occurrences.items() if count > 1}
290
+
291
def get_cleaned_data(locations):
    """Reduce every entry to a single coordinate.

    Entries with exactly one coordinate are kept unchanged. Entries with
    several coordinates keep one of them, chosen round-robin per label so
    that repeated labels are spread over the available positions. Entries
    with no coordinates are dropped.

    Args:
        locations: Iterable of ``(coords, label, color, width)`` tuples.

    Returns:
        List of ``(coords, label, color, width)`` tuples whose ``coords``
        list has exactly one element.
    """
    usage = defaultdict(int)  # how many times each multi-hit label was served
    cleaned = []
    for coords, label, color, w in locations:
        hits = len(coords)
        if hits == 1:
            cleaned.append((coords, label, color, w))
        elif hits > 1:
            pick = usage[label] % hits  # round-robin over the candidates
            cleaned.append(([coords[pick]], label, color, w))
            usage[label] += 1
    return cleaned
304
+
305
def get_width_info_tobeprinted(new_data):
    """Return the fourth element (the width) of every 4-tuple in ``new_data``."""
    return [w for _, _, _, w in new_data]
310
+
311
def clean_dimensions(text):
    """Strip "mm" units and thousands separators from a dimension string.

    Every "mm", together with any commas or whitespace directly before it,
    is removed first; all remaining commas are removed afterwards.
    """
    without_units = re.sub(r'[,\s]*mm', '', text)
    return without_units.replace(",", "")
316
+
317
def get_cleaned_width(width_info_tobeprinted):
    """Apply ``clean_dimensions`` to every entry and return the results as a list."""
    return [clean_dimensions(raw) for raw in width_info_tobeprinted]
322
+
323
def get_widths_bb_format(cleaned_width, kelma):
    """Format "<a>x<b>" dimension strings as "<w>mm wide x <h>mm high".

    Args:
        cleaned_width: Dimension strings such as ``"900x2100"``; the
            separator may be ``x``, ``X`` or ``×``.
        kelma: Header text of the structural-opening column. When it
            contains "W x H" / "Width x Height" the first number is the
            width; otherwise (including ``None``) the second number is
            taken as the width.

    Returns:
        One formatted string per input entry, in the same order. An entry
        without a separator is passed through unchanged, so the list stays
        aligned with the input.
    """
    pattern = r"\bW(?:idth)?\s*[×x]\s*H(?:eight)?\b"
    # A missing header is treated like one that does not say "W x H";
    # re.search would raise TypeError on None.
    width_first = isinstance(kelma, str) and re.search(pattern, kelma) is not None

    widths = []
    for widthaa in cleaned_width:
        # Split on the first separator and drop the padding around it.
        parts = re.split(r"\s*[x×X]\s*", widthaa.strip(), maxsplit=1)
        if len(parts) != 2:
            # str.find() returned -1 here before, which sliced off the last
            # character and produced a nonsensical "wide x high" string.
            widths.append(widthaa)
            continue

        first, second = parts
        if width_first:
            full_text = f"{first}mm wide x {second}mm high"
        else:
            full_text = f"{second}mm wide x {first}mm high"
        widths.append(full_text)
    return widths
337
+
338
+ import fitz # PyMuPDF
339
+ import PyPDF2
340
+ import io
341
+ from PyPDF2.generic import TextStringObject # ✅ Required for setting string values
342
+
343
def add_bluebeam_count_annotations(pdf_bytes, locations):
    """Draw one circle annotation per door location on the first page.

    Args:
        pdf_bytes: The PDF as a bytes-like object.
        locations: Iterable of ``(coords, label, color, width)`` tuples,
            where ``coords`` is a list of ``(x, y)`` points and ``color`` is
            an RGB triple in the 0-255 range. ``width`` is not used here.

    Returns:
        The annotated PDF as bytes.
    """
    pdf_document = fitz.open(stream=bytes(pdf_bytes), filetype="pdf")  # Open PDF in memory
    try:
        page = pdf_document[0]  # First page
        for coor, lbl, clr, w in locations:
            stroke = (clr[0] / 255, clr[1] / 255, clr[2] / 255)  # PyMuPDF expects 0-1 floats
            for cor in coor:
                # Create a Circle annotation (Count Markup): 20x20 box centred on the point
                annot = page.add_circle_annot(
                    fitz.Rect(cor[0] - 10, cor[1] - 10, cor[0] + 10, cor[1] + 10)
                )

                annot.set_colors(stroke=stroke, fill=(1, 1, 1))  # Coloured border, white fill
                annot.set_border(width=2)  # Border thickness
                annot.set_opacity(1)  # Fully visible

                # set_info takes keyword fields. The previous positional form
                # set_info("subject", "Count") bound the field name to the
                # `info` argument and the value to `content`, so the subject
                # and title were never written.
                annot.set_info(content=lbl, title=lbl, subject="Count")
                annot.update()  # Apply changes

        # Save modified PDF to a variable instead of a file
        output_stream = io.BytesIO()
        pdf_document.save(output_stream)
    finally:
        # Release the document even if annotating or saving fails.
        pdf_document.close()

    return output_stream.getvalue()  # Return the modified PDF as bytes
374
+
375
def modify_author_in_pypdf2(pdf_bytes, new_authors):
    """Overwrite the author (``/T``) of every annotation in a PDF.

    Annotations are visited page by page, in order; the n-th annotation
    receives ``new_authors[n]``. Once the list is exhausted, the last entry
    is reused. When ``new_authors`` is empty the annotations are left
    untouched.

    Args:
        pdf_bytes: The PDF as a bytes-like object.
        new_authors: Sequence of strings to assign.

    Returns:
        The rewritten PDF as bytes.
    """
    reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
    writer = PyPDF2.PdfWriter()

    author_index = 0  # Position of the next annotation across all pages

    for page in reader.pages:
        # An empty author list has no "last entry" to fall back on; indexing
        # it would raise IndexError, so skip relabelling entirely.
        if new_authors and "/Annots" in page:
            for annot in page["/Annots"]:
                annot_obj = annot.get_object()

                # Clamp to the last author once the list is exhausted.
                author = new_authors[min(author_index, len(new_authors) - 1)]
                # PDF dictionary keys must be NameObject instances; a plain
                # str key cannot be serialised when /T is not already present.
                annot_obj[NameObject("/T")] = TextStringObject(author)
                author_index += 1

        writer.add_page(page)

    # Save the modified PDF to a variable
    output_stream = io.BytesIO()
    writer.write(output_stream)
    return output_stream.getvalue()
404
+
405
+ # return output_stream.getvalue() # Return modified PDF as bytes
406
+
407
def process_pdf(input_pdf_path, output_pdf_path, locations, new_authors):
    """Annotate a PDF with count markers and relabel their authors.

    Args:
        input_pdf_path: Passed straight to
            ``add_bluebeam_count_annotations`` as its ``pdf_bytes``
            argument, so despite the name it is PDF content.
        output_pdf_path: Unused; the result is returned, not written to disk.
        locations: Marker locations for ``add_bluebeam_count_annotations``.
        new_authors: Author strings for ``modify_author_in_pypdf2``.

    Returns:
        The final PDF as bytes.
    """
    # Step 1: add the Bluebeam-compatible count annotations.
    with_markers = add_bluebeam_count_annotations(input_pdf_path, locations)
    # Step 2: rewrite the author field of those annotations using PyPDF2.
    return modify_author_in_pypdf2(with_markers, new_authors)
421
+
422
def mainRun(schedule, plan):
    """Mark the doors of a schedule on a plan and summarise the markers.

    The first usable table of ``schedule`` supplies door ids, types and
    structural openings. Every door id found in the text of ``plan`` gets a
    circle annotation whose author field carries the formatted opening size.

    Args:
        schedule: Door-schedule PDF content, as accepted by ``extract_tables``.
        plan: Plan PDF content, as accepted by ``read_text`` / ``process_pdf``.

    Returns:
        Tuple ``(annotatedimg, doc2, list1, repeated_labels, not_found)``:
        the first annotated page rendered as a BGR array, the annotated
        document (left open for the caller), a DataFrame with one row per
        coloured annotation (``content``, ``id``, ``subject``, ``color``),
        the labels listed more than once, and the labels not found in the
        plan.

    Raises:
        IndexError: If no table of the schedule yields a column selection.
    """
    dfs = extract_tables(schedule)
    selected_columns = get_selected_columns(dfs)
    # Only the first selection is used.
    selected_columns_new, df, clm_idx, clmn_name, starting_row_index = selected_columns[0]

    kelma = get_st_op_pattern(clm_idx, clmn_name, starting_row_index, df)
    col_dict = get_similar_colors(selected_columns_new)
    flattened_list = get_flattened_tuples_list(col_dict)
    plan_texts = read_text(plan)
    locations, not_found = get_word_locations_plan(flattened_list, plan_texts)
    new_data = get_cleaned_data(locations)
    repeated_labels = get_repeated_labels(locations)
    width_info_tobeprinted = get_width_info_tobeprinted(new_data)
    cleaned_width = get_cleaned_width(width_info_tobeprinted)
    widths = get_widths_bb_format(cleaned_width, kelma)
    final_pdf_bytes = process_pdf(plan, "final_output_width.pdf", new_data, widths)

    doc2 = fitz.open('pdf', final_pdf_bytes)
    page = doc2[0]
    pix = page.get_pixmap()  # render page to an image
    pl = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
    img = np.array(pl)
    annotatedimg = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    list1 = pd.DataFrame(columns=['content', 'id', 'subject', 'color'])

    for page in doc2:
        # Iterate through annotations on the page
        for annot in page.annots():
            # annot.colors is a dictionary with 'stroke' and 'fill' keys
            annot_color = annot.colors
            if annot_color is None:
                continue

            # Stroke wins over fill when both are set. An annotation with
            # neither is skipped: previously the key variable was unbound
            # (or stale from the previous annotation) in that case.
            if annot_color.get('stroke'):
                v = 'stroke'
            elif annot_color.get('fill'):
                v = 'fill'
            else:
                continue

            chosen = annot_color.get(v)
            x, y, z = int(chosen[0] * 255), int(chosen[1] * 255), int(chosen[2] * 255)
            list1.loc[len(list1)] = [
                annot.info['content'],
                annot.info['id'],
                annot.info['subject'],
                [x, y, z],
            ]
    return annotatedimg, doc2, list1, repeated_labels, not_found
471
+
472
+