Marthee committed on
Commit
9e12634
·
verified ·
1 Parent(s): c562e63

Create Doors_Schedule

Browse files
Files changed (1) hide show
  1. Doors_Schedule +229 -0
Doors_Schedule ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import pandas as pd
3
+ import random
4
+ import re
5
+ import io
6
+ import pypdfium2 as pdfium
7
+ import fitz
8
+ from PIL import Image, ImageDraw
9
+ from PyPDF2 import PdfReader, PdfWriter
10
+ from PyPDF2.generic import TextStringObject, NameObject, ArrayObject, FloatObject
11
+ from PyPDF2.generic import NameObject, TextStringObject, DictionaryObject, FloatObject, ArrayObject
12
+ from PyPDF2 import PdfReader
13
+ from PyPDF2.generic import TextStringObject
14
+
15
+
16
+
17
+
18
def convert2img(path):
    """Render the first page of a PDF as a BGR image array.

    Args:
        path: Location of the PDF, passed straight to ``pdfium.PdfDocument``.

    Returns:
        A contiguous numpy array holding the rendered page with its colour
        channels in blue-green-red order.
    """
    # The module-level imports provide neither numpy nor OpenCV, so the
    # original body failed with NameError.  numpy is imported here and the
    # channel swap is done with plain slicing, so OpenCV is not needed.
    import numpy as np

    pdf = pdfium.PdfDocument(path)
    page = pdf.get_page(0)
    pil_image = page.render().to_pil()

    # Force three channels, then reverse the last axis (RGB -> BGR).
    rgb = np.array(pil_image.convert("RGB"))
    img = np.ascontiguousarray(rgb[:, :, ::-1])
    return img
25
+
26
def convert2pillow(path):
    """Render the first page of a PDF and return it as a PIL image.

    Args:
        path: Location of the PDF, passed straight to ``pdfium.PdfDocument``.

    Returns:
        The PIL image produced by rendering page 0.
    """
    document = pdfium.PdfDocument(path)
    first_page = document.get_page(0)
    rendered = first_page.render()
    return rendered.to_pil()
31
+
32
def calculate_midpoint(x1,y1,x2,y2):
    """Return the integer midpoint of the segment (x1, y1)-(x2, y2).

    Each coordinate is averaged and then truncated towards zero by ``int``.

    Returns:
        A ``(x, y)`` tuple of ints.
    """
    mid_x, mid_y = (int((start + end) / 2) for start, end in ((x1, x2), (y1, y2)))
    return (mid_x, mid_y)
36
+
37
def read_text(input_pdf_path):
    """Extract the word entries of a PDF page with PyMuPDF.

    Every page is visited and its ``get_text("words")`` result replaces the
    previous one, so the value returned belongs to the last page.

    Args:
        input_pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        The ``get_text("words")`` result for the last page, or an empty list
        when the document has no pages.
    """
    # Start from an empty result so a page-less document does not leave the
    # name unbound.
    text_instances = []

    # The context manager closes the document handle, which the original
    # code left open.
    with fitz.open(input_pdf_path) as pdf_document:
        for page in pdf_document:
            text_instances = page.get_text("words")

            # Kept from the original; it runs after the words were read and
            # the document is never saved.
            page.apply_redactions()
    return text_instances
46
+
47
def search_columns(df):
    """Locate the door-schedule header fields of a table.

    The fields ``door_id``, ``door_type``, ``width`` and ``height`` are first
    looked up in the column labels.  Pairs that are missing from the labels
    are then looked up in the cells of the first two rows.  When width or
    height is still missing, ``structural opening`` is looked up the same
    way (labels first, then cells).

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A dict keyed by field name.  A field found in the column labels maps
        to a list of column positions; a field found in the cells maps to a
        list of ``(row, column)`` position tuples.  Fields that were not
        found are absent.
    """
    flags = re.IGNORECASE

    # Define patterns (compiled once; all matching is case-insensitive).
    door_id_pattern = re.compile(r'\b(?:door\s*)?(?:id|no|number)(?!-)\b', flags)
    door_type_pattern = re.compile(r'^\s*(?:\S*\s+)?door\s*[\n\s]*type\s*$|^type\s*$', flags)
    width_pattern = re.compile(r'^\s*(?:WIDTH|Width|width)\s*$', flags)
    height_pattern = re.compile(r'^\s*(?:HEIGHT|Height|height)\s*$', flags)
    structural_opening_pattern = re.compile(r'\b(?:Structural\s+opening|structural\s+opening)\b', flags)

    # Search the column labels and return matching column positions.
    def find_column_indices(df, patterns):
        matches = {}
        for key, pattern in patterns.items():
            # str() keeps non-string labels (e.g. default integer labels)
            # from crashing the regex search.
            indices = [
                i for i, col in enumerate(df.columns)
                if pattern.search(str(col))
            ]
            if indices:
                matches[key] = indices
        return matches

    # Search the cells and return (row, column) position pairs.
    def find_matches_in_cells(df, patterns):
        row_limit = min(2, len(df))  # Only the first two rows are scanned
        col_count = len(df.columns)
        matches = {}
        for key, pattern in patterns.items():
            found = []
            for row_idx in range(row_limit):
                for col_idx in range(col_count):
                    cell = df.iat[row_idx, col_idx]
                    if isinstance(cell, str) and pattern.search(cell):
                        found.append((row_idx, col_idx))
            if found:
                matches[key] = found
        return matches

    # Search in column names first
    patterns = {
        "door_id": door_id_pattern,
        "door_type": door_type_pattern,
        "width": width_pattern,
        "height": height_pattern,
    }
    column_matches = find_column_indices(df, patterns)

    # If door_id and door_type are NOT found in column names, search in cells
    if "door_id" not in column_matches and "door_type" not in column_matches:
        cell_matches = find_matches_in_cells(
            df, {"door_id": door_id_pattern, "door_type": door_type_pattern}
        )
        column_matches.update(cell_matches)

    # If width and height are NOT found in column names, search for them in cells
    if "width" not in column_matches and "height" not in column_matches:
        cell_matches = find_matches_in_cells(
            df, {"width": width_pattern, "height": height_pattern}
        )
        column_matches.update(cell_matches)

    # If width or height is still missing, fall back to the structural
    # opening header: column names first, then cells.
    if "width" not in column_matches or "height" not in column_matches:
        structural_opening_match = find_column_indices(
            df, {"structural opening": structural_opening_pattern}
        )
        column_matches.update(structural_opening_match)

        if "structural opening" not in column_matches:
            structural_opening_match = find_matches_in_cells(
                df, {"structural opening": structural_opening_pattern}
            )
            column_matches.update(structural_opening_match)

    return column_matches
114
+
115
def row_clmn_indices(column_matches):
    """Reduce each field's matches to its first hit.

    Args:
        column_matches: Dict mapping a field name to a non-empty list whose
            items are either column positions or ``(row, column)`` tuples.

    Returns:
        A pair ``(columns, rows)``.  ``columns`` is a list of
        ``(field, column position)`` tuples, one per field.  ``rows`` holds
        the row position of every field whose first hit was a tuple.
    """
    field_columns = []
    header_rows = []
    for field, hits in column_matches.items():
        first_hit = hits[0]
        if type(first_hit) is not tuple:
            # Hit came as a bare column position.
            field_columns.append((field, first_hit))
            continue
        # Hit came as a (row, column) pair.
        field_columns.append((field, first_hit[1]))
        header_rows.append(first_hit[0])
    return field_columns, header_rows
125
+
126
+
127
def generate_current_table_without_cropping(clm_idx,df):
    """Return the columns of ``df`` at the given positions, all rows kept.

    Args:
        clm_idx: Column positions handed to ``df.iloc``.
        df: Source DataFrame.

    Returns:
        The positional column selection of ``df``.
    """
    chosen_columns = df.iloc[:, clm_idx]
    # Progress message emitted once the selection succeeded.
    print("hello I generated the selected columns table without cropping")
    return chosen_columns
131
+
132
def column_name_index(clm_idx):
    """Split ``(name, position)`` pairs into two parallel lists.

    Args:
        clm_idx: List of two-item ``(name, position)`` pairs.

    Returns:
        ``(names, positions)`` as two lists in the input order.
    """
    names = []
    positions = []
    for name, position in clm_idx:
        names += [name]
        positions += [position]
    return names, positions
140
+
141
def crop_rename_table(indices, clmn_name, clmn_idx,df):
    """Drop the header rows of a table and keep the renamed target columns.

    Args:
        indices: Row positions of the header cells; rows up to and including
            the largest one are removed.
        clmn_name: New labels for the selected columns.
        clmn_idx: Positions of the columns to keep.
        df: Source DataFrame (not modified).

    Returns:
        A DataFrame with the remaining rows re-indexed from 0, holding the
        selected columns labelled with ``clmn_name``.
    """
    first_data_row = 1 + max(indices)

    # Slice off the header rows and renumber what is left.
    data_rows = df.iloc[first_data_row:].reset_index(drop=True)

    # Pick the columns by position, then give them their field names.
    picked = data_rows.iloc[:, clmn_idx]
    picked.columns = clmn_name
    return picked
153
+
154
def details_in_another_table(clmn_name, clmn_idx, current_dfs, dfs):
    """Take the target columns from a table as wide as the current one.

    The last table in ``dfs`` whose column count equals that of
    ``current_dfs`` is used.  Its own column labels are inserted as the first
    data row, then the selected columns are relabelled with ``clmn_name``.

    Args:
        clmn_name: New labels for the selected columns.
        clmn_idx: Positions of the columns to keep.
        current_dfs: The table whose column count must be matched.
        dfs: Candidate tables.

    Returns:
        A new DataFrame: the chosen table's labels as row 0, followed by its
        rows, restricted to ``clmn_idx`` and labelled with ``clmn_name``.

    Raises:
        ValueError: If no table in ``dfs`` has the required column count.
    """
    wanted_width = current_dfs.shape[1]

    # Later matches override earlier ones, as in the original loop.
    source = None
    for candidate in dfs:
        if candidate.shape[1] == wanted_width:
            source = candidate

    # The original fell through to an unbound local here.
    if source is None:
        raise ValueError(
            f"no table with {wanted_width} columns found among {len(dfs)} tables"
        )

    # Build the result once, from the final match only.
    new_df = source.iloc[:, clmn_idx].copy()  # .copy() keeps the source intact
    column_names_row = pd.DataFrame([new_df.columns], columns=new_df.columns)

    # Put the former labels on top of the data rows.
    new_df = pd.concat([column_names_row, new_df], ignore_index=True)

    new_df.columns = clmn_name
    return new_df
168
+
169
def extract_tables(schedule):
    """Collect every table PyMuPDF detects in a PDF as DataFrames.

    Args:
        schedule: Path of the PDF, passed to ``fitz.open``.

    Returns:
        A list with one DataFrame per detected table, in page order.  The
        list is empty when nothing is detected or the document has no pages.
    """
    # Created once, outside the page loop.  The original re-created the list
    # for every page, so tables from earlier pages were discarded.
    dfs = []

    # The context manager closes the document handle after extraction.
    with fitz.open(schedule) as doc:
        for page in doc:
            for tab in page.find_tables():
                dfs.append(tab.to_pandas())
    return dfs
178
+
179
def get_selected_columns(dfs):
    """Build the door-schedule column selection for every usable table.

    For each table the header fields are located with ``search_columns``.
    Tables without any match, and tables whose fields were found partly in
    the column labels and partly in the cells, only produce a log line.

    For the remaining tables the row count decides where the data comes from:
    fewer than 10 rows means the details are taken from another table of the
    same width, otherwise from the table itself.

    Args:
        dfs: List of DataFrames, e.g. the result of ``extract_tables``.

    Returns:
        A list of tuples ``(selected, table, clm_idx, clmn_name,
        starting_row_index)``, one per table that yielded a selection.
    """
    # Tables shorter than this are treated as header-only.  The original
    # tested ``< 10`` and ``> 10`` and silently dropped tables with exactly
    # 10 rows; those now take the "same table" route.
    min_rows_with_details = 10

    selected_columns = []
    for i, table in enumerate(dfs):
        column_matches = search_columns(table)
        clm_idx, starting_row_index = row_clmn_indices(column_matches)
        clmn_name, clmn_idx = column_name_index(clm_idx)

        if len(clm_idx) == 0 and len(starting_row_index) == 0:
            print(f"this is df {i}, SEARCH IN ANOTHER DF")
            continue

        details_elsewhere = len(table) < min_rows_with_details

        # MIX: some fields in the labels, some in the cells -> log only.
        if (len(clm_idx) != len(starting_row_index)) and len(starting_row_index) > 0:
            print(f"this is df {i} MIX, search in another df but make sure of the length")

        # IN COLUMNS: every field was found in the column labels.
        if len(starting_row_index) == 0:
            print(f"this is df {i} mawgooda fel columns, check el df length 3ashan law el details fe table tany")
            if details_elsewhere:
                selected_columns_new = details_in_another_table(clmn_name, clmn_idx, table, dfs)
            else:
                selected_columns_new = generate_current_table_without_cropping(clmn_idx, table)
            selected_columns.append((selected_columns_new, table, clm_idx, clmn_name, starting_row_index))

        # IN CELLS: every field was found in the cells.
        elif len(starting_row_index) == len(clm_idx):
            print(f"this is df {i} mawgooda fel cells, check el df length 3ashan law el details fe table tany")
            if details_elsewhere:
                selected_columns_new = details_in_another_table(clmn_name, clmn_idx, table, dfs)
            else:
                print(f"this is df {i} call crop_rename_table(indices, clmn_name, clmn_idx,df)")
                selected_columns_new = crop_rename_table(starting_row_index, clmn_name, clmn_idx, table)
            selected_columns.append((selected_columns_new, table, clm_idx, clmn_name, starting_row_index))
    return selected_columns
218
+
219
def get_st_op_pattern(clm_idx, clmn_name, starting_row_index, df=None):
    """Return the header cell text of the 'structural opening' column.

    Args:
        clm_idx: List of ``(field name, column position)`` pairs.
        clmn_name: Field names, in the same order as ``clm_idx``.
        starting_row_index: Header row positions, indexed by the position of
            the field inside ``clmn_name``.
        df: Table to read the cell from.  The original body read a name
            ``df`` that this function never defined; when the argument is
            omitted, a module-level ``df`` is used if one exists.

    Returns:
        The cell at (header row, structural-opening column), or ``None`` when
        ``clmn_name`` has no 'structural opening' entry.

    Raises:
        NameError: If no table is passed and no module-level ``df`` exists.
    """
    target = 'structural opening'

    # Nothing to look up; the original left ``position`` unbound here.
    if target not in clmn_name:
        return None

    if df is None:
        df = globals().get("df")
        if df is None:
            raise NameError(
                "get_st_op_pattern needs a table: pass df= or define a module-level df"
            )

    structural_opening_value = dict(clm_idx).get(target)  # column position
    position = clmn_name.index(target)

    # NOTE(review): starting_row_index only has entries for fields found in
    # cells, so ``position`` may not line up with it when some fields came
    # from the column labels - confirm against the caller.
    kelma = df.iloc[starting_row_index[position], structural_opening_value]
    return kelma
229
+