Marthee committed on
Commit
9c9a7e1
·
verified ·
1 Parent(s): 3dbac4d

Update Doors_Schedule.py

Browse files
Files changed (1) hide show
  1. Doors_Schedule.py +34 -15
Doors_Schedule.py CHANGED
@@ -52,8 +52,10 @@ def search_columns(df):
52
 
53
  door_id_pattern = r'\b(?:door\s*)?(?:id|no|number)(?!-)\b'
54
  door_type_pattern = r'^\s*(?:\S*\s+)?door\s*[\n\s]*type\s*$|^type\s*$'
55
- width_pattern = r'^\s*(?:WIDTH|Width|width)\s*$'
56
- height_pattern = r'^\s*(?:HEIGHT|Height|height)\s*$'
 
 
57
  structural_opening_pattern = r'\b(?:Structural\s+opening|structural\s+opening)\b'
58
 
59
  # Function to search in column names and return column indices
@@ -152,20 +154,37 @@ def crop_rename_table(indices, clmn_name, clmn_idx,df):
152
 
153
  return slctd_clms
154
 
 
 
 
155
  def details_in_another_table(clmn_name, clmn_idx, current_dfs, dfs):
156
- for dff in dfs:
157
- if dff.shape[1] == current_dfs.shape[1]:
158
- df = dff
159
- # Create a new DataFrame with selected columns
160
- new_df = df.iloc[:, clmn_idx].copy() # Use .copy() to avoid modifying original df
161
- column_names_row = pd.DataFrame([new_df.columns], columns=new_df.columns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
- # Append the original data below the column names row
164
- new_df = pd.concat([column_names_row, new_df], ignore_index=True)
165
 
166
- # Rename columns
167
- new_df.columns = clmn_name
168
- return new_df
169
 
170
  def extract_tables(schedule):
171
  doc = fitz.open("pdf",schedule)
@@ -191,7 +210,7 @@ def get_selected_columns(dfs):
191
  print(f"this is df {i} MIX, search in another df but make sure of the length")
192
 
193
  #IN COLUMNS
194
- if len(starting_row_index) == 0:
195
  print(f"this is df {i} mawgooda fel columns, check el df length 3ashan law el details fe table tany")
196
  #details in another table
197
  if len(dfs[i]) <10:
@@ -199,7 +218,7 @@ def get_selected_columns(dfs):
199
  selected_columns.append((selected_columns_new, dfs[i],clm_idx, clmn_name, starting_row_index))
200
  #details in the same table
201
  if len(dfs[i]) >10:
202
- selected_columns_new = generate_current_table_without_cropping(clmn_idx, clmn_name, dfs[i])
203
  selected_columns.append((selected_columns_new, dfs[i],clm_idx, clmn_name, starting_row_index))
204
 
205
  #IN CELLS
 
52
 
53
  door_id_pattern = r'\b(?:door\s*)?(?:id|no|number)(?!-)\b'
54
  door_type_pattern = r'^\s*(?:\S*\s+)?door\s*[\n\s]*type\s*$|^type\s*$'
55
+ #width_pattern = r'^\s*(?:WIDTH|Width|width)\s*$'
56
+ #height_pattern = r'^\s*(?:HEIGHT|Height|height)\s*$'
57
+ width_pattern = r'^\s*width\s*(?:\n|\s)+.*$'
58
+ height_pattern = r'^\s*height\s*(?:\n|\s)+.*$'
59
  structural_opening_pattern = r'\b(?:Structural\s+opening|structural\s+opening)\b'
60
 
61
  # Function to search in column names and return column indices
 
154
 
155
  return slctd_clms
156
 
157
def clean_column_row(row):
    """Return the cells of ``row`` as strings without a leading ``<digits>-`` prefix.

    Every cell is passed through ``str()`` first, so non-string labels are
    accepted. A run of digits at the very start of the text, followed by a
    hyphen and any whitespace, is removed; everything else is left untouched.

    NOTE(review): presumably the prefix is the column index that the table
    extractor prepends to header names -- confirm against the extraction
    code. Be aware that a genuine value such as ``"12-A"`` is shortened to
    ``"A"`` by the same rule.
    """
    return [re.sub(r'^\d+-\s*', '', str(cell)) for cell in row]


def details_in_another_table(clmn_name, clmn_idx, current_dfs, dfs):
    """Collect the selected columns from every other table shaped like ``current_dfs``.

    Args:
        clmn_name: Column labels to assign to the result, one per entry of
            ``clmn_idx``.
        clmn_idx: Positional column indices to take from each matching table.
        current_dfs: The table currently being processed. It is excluded from
            the search by identity (``is``), not by content.
        dfs: All candidate tables.

    Returns:
        A single DataFrame with columns ``clmn_name`` and a fresh 0..n-1
        index. For each matching table it holds one row built from that
        table's (cleaned) column labels, followed by the table's data rows.
        ``None`` is returned when no other table has the same number of
        columns as ``current_dfs``.
    """
    matching_dfs = [
        dff for dff in dfs
        if dff is not current_dfs and current_dfs.shape[1] == dff.shape[1]
    ]
    if not matching_dfs:
        return None

    pieces = []
    for dff in matching_dfs:
        # Work on a copy so renaming the columns never touches the caller's table.
        selected_dff = dff.iloc[:, clmn_idx].copy()

        # The column labels are kept as a data row rather than discarded.
        # NOTE(review): presumably because a continuation table has no header
        # of its own and its first record was parsed as the header -- confirm.
        header_row = pd.DataFrame(
            [clean_column_row(selected_dff.columns.tolist())]
        )

        # Give both pieces identical labels so they stack column-by-column.
        header_row.columns = clmn_name
        selected_dff.columns = clmn_name

        pieces.extend((header_row, selected_dff))

    # One concat over all pieces; ignore_index renumbers the rows 0..n-1.
    return pd.concat(pieces, ignore_index=True)
 
 
188
 
189
  def extract_tables(schedule):
190
  doc = fitz.open("pdf",schedule)
 
210
  print(f"this is df {i} MIX, search in another df but make sure of the length")
211
 
212
  #IN COLUMNS
213
+ if len(starting_row_index) == 0 and len(clm_idx)>2:
214
  print(f"this is df {i} mawgooda fel columns, check el df length 3ashan law el details fe table tany")
215
  #details in another table
216
  if len(dfs[i]) <10:
 
218
  selected_columns.append((selected_columns_new, dfs[i],clm_idx, clmn_name, starting_row_index))
219
  #details in the same table
220
  if len(dfs[i]) >10:
221
+ selected_columns_new = generate_current_table_without_cropping(clmn_idx,clmn_name, dfs[i])
222
  selected_columns.append((selected_columns_new, dfs[i],clm_idx, clmn_name, starting_row_index))
223
 
224
  #IN CELLS