Spaces:
Sleeping
Sleeping
Update Doors_Schedule.py
Browse files- Doors_Schedule.py +34 -15
Doors_Schedule.py
CHANGED
|
@@ -52,8 +52,10 @@ def search_columns(df):
|
|
| 52 |
|
| 53 |
door_id_pattern = r'\b(?:door\s*)?(?:id|no|number)(?!-)\b'
|
| 54 |
door_type_pattern = r'^\s*(?:\S*\s+)?door\s*[\n\s]*type\s*$|^type\s*$'
|
| 55 |
-
width_pattern = r'^\s*(?:WIDTH|Width|width)\s*$'
|
| 56 |
-
height_pattern = r'^\s*(?:HEIGHT|Height|height)\s*$'
|
|
|
|
|
|
|
| 57 |
structural_opening_pattern = r'\b(?:Structural\s+opening|structural\s+opening)\b'
|
| 58 |
|
| 59 |
# Function to search in column names and return column indices
|
|
@@ -152,20 +154,37 @@ def crop_rename_table(indices, clmn_name, clmn_idx,df):
|
|
| 152 |
|
| 153 |
return slctd_clms
|
| 154 |
|
|
|
|
|
|
|
|
|
|
| 155 |
def details_in_another_table(clmn_name, clmn_idx, current_dfs, dfs):
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
|
| 164 |
-
new_df = pd.concat([column_names_row, new_df], ignore_index=True)
|
| 165 |
|
| 166 |
-
|
| 167 |
-
new_df.columns = clmn_name
|
| 168 |
-
return new_df
|
| 169 |
|
| 170 |
def extract_tables(schedule):
|
| 171 |
doc = fitz.open("pdf",schedule)
|
|
@@ -191,7 +210,7 @@ def get_selected_columns(dfs):
|
|
| 191 |
print(f"this is df {i} MIX, search in another df but make sure of the length")
|
| 192 |
|
| 193 |
#IN COLUMNS
|
| 194 |
-
if len(starting_row_index) == 0:
|
| 195 |
print(f"this is df {i} mawgooda fel columns, check el df length 3ashan law el details fe table tany")
|
| 196 |
#details in another table
|
| 197 |
if len(dfs[i]) <10:
|
|
@@ -199,7 +218,7 @@ def get_selected_columns(dfs):
|
|
| 199 |
selected_columns.append((selected_columns_new, dfs[i],clm_idx, clmn_name, starting_row_index))
|
| 200 |
#details in the same table
|
| 201 |
if len(dfs[i]) >10:
|
| 202 |
-
selected_columns_new = generate_current_table_without_cropping(clmn_idx,
|
| 203 |
selected_columns.append((selected_columns_new, dfs[i],clm_idx, clmn_name, starting_row_index))
|
| 204 |
|
| 205 |
#IN CELLS
|
|
|
|
| 52 |
|
| 53 |
door_id_pattern = r'\b(?:door\s*)?(?:id|no|number)(?!-)\b'
|
| 54 |
door_type_pattern = r'^\s*(?:\S*\s+)?door\s*[\n\s]*type\s*$|^type\s*$'
|
| 55 |
+
#width_pattern = r'^\s*(?:WIDTH|Width|width)\s*$'
|
| 56 |
+
#height_pattern = r'^\s*(?:HEIGHT|Height|height)\s*$'
|
| 57 |
+
width_pattern = r'^\s*width\s*(?:\n|\s)+.*$'
|
| 58 |
+
height_pattern = r'^\s*height\s*(?:\n|\s)+.*$'
|
| 59 |
structural_opening_pattern = r'\b(?:Structural\s+opening|structural\s+opening)\b'
|
| 60 |
|
| 61 |
# Function to search in column names and return column indices
|
|
|
|
| 154 |
|
| 155 |
return slctd_clms
|
| 156 |
|
| 157 |
+
def clean_column_row(row):
    """Return the cells of *row* as strings without a leading "<digits>-" prefix.

    Every cell is converted with ``str()`` before the prefix is stripped, so
    non-string cells (numbers, ``None``) come back as their string form.
    Only a run of digits followed by a hyphen at the very start of the cell
    is removed, together with any whitespace right after the hyphen.

    Args:
        row: Iterable of cell values (e.g. a list of column labels).

    Returns:
        A new list of cleaned strings, one per input cell, in the same order.
    """
    return [re.sub(r'^\d+-\s*', '', str(cell)) for cell in row]
|
| 159 |
+
|
| 160 |
def details_in_another_table(clmn_name, clmn_idx, current_dfs, dfs):
    """Collect the selected columns from every other table shaped like the current one.

    A table in ``dfs`` is used when it is not the same object as
    ``current_dfs`` and has the same number of columns. For each such table
    the columns at positions ``clmn_idx`` are taken, the table's own column
    labels are cleaned with ``clean_column_row`` and inserted as the first
    row, and the columns are renamed to ``clmn_name``.

    Args:
        clmn_name: New column labels; one per entry in ``clmn_idx``.
        clmn_idx: Positional column indices to select from each table.
        current_dfs: The reference DataFrame; it is excluded from the search.
        dfs: Iterable of candidate DataFrames.

    Returns:
        A single DataFrame stacking all processed tables with a fresh index,
        or ``None`` when no other table has a matching column count.
    """
    # Identity comparison (not equality) is used to skip the reference table.
    matching_dfs = [
        dff for dff in dfs
        if dff is not current_dfs and current_dfs.shape[1] == dff.shape[1]
    ]

    if not matching_dfs:
        return None

    updated_dfs = []
    for dff in matching_dfs:
        selected_dff = dff.iloc[:, clmn_idx].copy()

        # Clean the column names and make them a row
        cleaned_header = clean_column_row(selected_dff.columns.tolist())
        col_names_as_row = pd.DataFrame([cleaned_header])

        # Rename columns so both frames align when concatenated
        selected_dff.columns = clmn_name
        col_names_as_row.columns = clmn_name

        # Combine the cleaned row with data
        temp_df = pd.concat([col_names_as_row, selected_dff], ignore_index=True)
        updated_dfs.append(temp_df)

    combined_df = pd.concat(updated_dfs, ignore_index=True)

    return combined_df
|
|
|
|
|
|
|
| 188 |
|
| 189 |
def extract_tables(schedule):
|
| 190 |
doc = fitz.open("pdf",schedule)
|
|
|
|
| 210 |
print(f"this is df {i} MIX, search in another df but make sure of the length")
|
| 211 |
|
| 212 |
#IN COLUMNS
|
| 213 |
+
if len(starting_row_index) == 0 and len(clm_idx)>2:
|
| 214 |
print(f"this is df {i} mawgooda fel columns, check el df length 3ashan law el details fe table tany")
|
| 215 |
#details in another table
|
| 216 |
if len(dfs[i]) <10:
|
|
|
|
| 218 |
selected_columns.append((selected_columns_new, dfs[i],clm_idx, clmn_name, starting_row_index))
|
| 219 |
#details in the same table
|
| 220 |
if len(dfs[i]) >10:
|
| 221 |
+
selected_columns_new = generate_current_table_without_cropping(clmn_idx,clmn_name, dfs[i])
|
| 222 |
selected_columns.append((selected_columns_new, dfs[i],clm_idx, clmn_name, starting_row_index))
|
| 223 |
|
| 224 |
#IN CELLS
|