Update Doors_Schedule.py
Browse files- Doors_Schedule.py +92 -60
Doors_Schedule.py
CHANGED
|
@@ -32,6 +32,8 @@ from collections import defaultdict
|
|
| 32 |
from xml.etree.ElementTree import Element, SubElement, tostring
|
| 33 |
from azure.ai.formrecognizer import DocumentAnalysisClient
|
| 34 |
from azure.core.credentials import AzureKeyCredential
|
|
|
|
|
|
|
| 35 |
|
| 36 |
def convert2img(path):
|
| 37 |
pdf = pdfium.PdfDocument(path)
|
|
@@ -110,12 +112,16 @@ def flexible_search(df, search_terms):
|
|
| 110 |
return results
|
| 111 |
|
| 112 |
|
| 113 |
-
def generate_current_table_without_cropping(clm_idx, clmn_name, df):
|
| 114 |
selected_df = df.iloc[:, clm_idx]
|
| 115 |
print("hello I generated the selected columns table without cropping")
|
| 116 |
selected_df.columns = clmn_name
|
| 117 |
-
return selected_df
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
def crop_rename_table(indices, clmn_name, clmn_idx,df):
|
| 121 |
#crop_at = (max(set(indices), key=indices.count)) + 1
|
|
@@ -496,6 +502,8 @@ def get_selected_columns_all(dfs, user_patterns):
|
|
| 496 |
#details in the same table
|
| 497 |
if len(dfs[i]) >10:
|
| 498 |
selected_columns_new = generate_current_table_without_cropping(column_index_list,dfs[i])
|
|
|
|
|
|
|
| 499 |
#break
|
| 500 |
|
| 501 |
#IN CELLS
|
|
@@ -1670,32 +1678,48 @@ def pick_approach(schedule, plan, searcharray, flag):
|
|
| 1670 |
|
| 1671 |
return no_tables, not_found_any_plan
|
| 1672 |
|
| 1673 |
-
def
|
| 1674 |
-
|
| 1675 |
-
|
| 1676 |
-
|
| 1677 |
-
|
| 1678 |
-
|
| 1679 |
-
|
| 1680 |
-
|
| 1681 |
-
|
| 1682 |
-
|
| 1683 |
-
|
| 1684 |
-
|
| 1685 |
-
|
| 1686 |
-
|
| 1687 |
-
|
| 1688 |
-
|
| 1689 |
-
|
| 1690 |
-
if len(not_found_any_plan_model) > len(not_found_any_plan_normal):
|
| 1691 |
-
#print("choose not_found_any_plan_normal")
|
| 1692 |
pick_normal = True
|
| 1693 |
-
|
|
|
|
| 1694 |
pick_model = True
|
| 1695 |
-
#print("choose
|
| 1696 |
-
|
| 1697 |
-
|
| 1698 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1699 |
|
| 1700 |
#print(type(plan))
|
| 1701 |
eltype = type(plan)
|
|
@@ -1735,10 +1759,14 @@ def mainRun(schedule, plan, searcharray):
|
|
| 1735 |
#width_plan = math.ceil(width_plan)
|
| 1736 |
#height_plan = math.ceil(height_plan)
|
| 1737 |
for k in range(len(schedule)):
|
| 1738 |
-
if pick_normal:
|
| 1739 |
dfs = extract_tables(schedule[k])
|
| 1740 |
-
if pick_model:
|
| 1741 |
dfs = extract_tables_model(schedule[k])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1742 |
user_input_this_schedule = searcharray[k]
|
| 1743 |
for j in range(len(user_input_this_schedule)):
|
| 1744 |
user_input = user_input_this_schedule[j]
|
|
@@ -1758,20 +1786,21 @@ def mainRun(schedule, plan, searcharray):
|
|
| 1758 |
print("mafeesh secondary information")
|
| 1759 |
|
| 1760 |
selected_columns_combined = get_selected_columns_all(dfs, user_input)
|
| 1761 |
-
if
|
| 1762 |
-
|
| 1763 |
-
|
| 1764 |
-
|
| 1765 |
-
|
| 1766 |
-
|
| 1767 |
-
|
| 1768 |
-
|
| 1769 |
-
|
| 1770 |
-
|
| 1771 |
-
|
| 1772 |
-
|
| 1773 |
-
|
| 1774 |
-
|
|
|
|
| 1775 |
selected_columns_combined = selected_columns_combined.applymap(lambda x: 'N/A' if isinstance(x, str) and x.strip() == '' else x)
|
| 1776 |
selected_columns_combined = selected_columns_combined.fillna('N/A')
|
| 1777 |
selected_columns_combined = selected_columns_combined.replace(r'(?i)\bn/a\b', 'N/A', regex=True)
|
|
@@ -1802,6 +1831,8 @@ def mainRun(schedule, plan, searcharray):
|
|
| 1802 |
df_points = grouped_to_dataframe_dynamic(col_dict,
|
| 1803 |
drop_empty_locations=True,
|
| 1804 |
explode_locations=True)
|
|
|
|
|
|
|
| 1805 |
# handling no door type in the new dictionary logic
|
| 1806 |
if 'color' not in df_points:
|
| 1807 |
df_points['color'] = (0, 0, 255)
|
|
@@ -1809,24 +1840,25 @@ def mainRun(schedule, plan, searcharray):
|
|
| 1809 |
repeated_ids = dupes[dupes > 1].index.to_list()
|
| 1810 |
repeated_labels_list.append(repeated_ids)
|
| 1811 |
|
| 1812 |
-
if
|
| 1813 |
-
|
| 1814 |
-
|
| 1815 |
-
|
| 1816 |
-
|
| 1817 |
-
|
| 1818 |
-
|
| 1819 |
-
|
| 1820 |
-
|
| 1821 |
-
|
| 1822 |
-
|
| 1823 |
-
|
| 1824 |
-
|
| 1825 |
-
|
| 1826 |
-
|
| 1827 |
-
|
| 1828 |
-
|
| 1829 |
-
|
|
|
|
| 1830 |
#if kelma == None:
|
| 1831 |
#widths, secondary_tobeprinted = get_width_info_tobeprinted_secondary(new_data3, main_info, secondary_info)
|
| 1832 |
#else:
|
|
|
|
| 32 |
from xml.etree.ElementTree import Element, SubElement, tostring
|
| 33 |
from azure.ai.formrecognizer import DocumentAnalysisClient
|
| 34 |
from azure.core.credentials import AzureKeyCredential
|
| 35 |
+
import chardet
|
| 36 |
+
|
| 37 |
|
| 38 |
def convert2img(path):
|
| 39 |
pdf = pdfium.PdfDocument(path)
|
|
|
|
| 112 |
return results
|
| 113 |
|
| 114 |
|
| 115 |
+
"""def generate_current_table_without_cropping(clm_idx, clmn_name, df):
|
| 116 |
selected_df = df.iloc[:, clm_idx]
|
| 117 |
print("hello I generated the selected columns table without cropping")
|
| 118 |
selected_df.columns = clmn_name
|
| 119 |
+
return selected_df"""
|
| 120 |
|
| 121 |
+
def generate_current_table_without_cropping(clm_idx,df):
|
| 122 |
+
selected_df = df.iloc[:, clm_idx]
|
| 123 |
+
print("hello I generated the selected columns table without cropping")
|
| 124 |
+
return selected_df
|
| 125 |
|
| 126 |
def crop_rename_table(indices, clmn_name, clmn_idx,df):
|
| 127 |
#crop_at = (max(set(indices), key=indices.count)) + 1
|
|
|
|
| 502 |
#details in the same table
|
| 503 |
if len(dfs[i]) >10:
|
| 504 |
selected_columns_new = generate_current_table_without_cropping(column_index_list,dfs[i])
|
| 505 |
+
selected_columns_new.columns = clmn_name # must match number of columns
|
| 506 |
+
|
| 507 |
#break
|
| 508 |
|
| 509 |
#IN CELLS
|
|
|
|
| 1678 |
|
| 1679 |
return no_tables, not_found_any_plan
|
| 1680 |
|
| 1681 |
+
def get_df_csv(sch):
    """Load a CSV schedule into a DataFrame, auto-detecting its encoding.

    Only a 100 KB prefix of the file is sniffed with ``chardet`` — enough
    for a reliable guess without reading large files twice in full. If
    detection returns no encoding, UTF-8 is used as the fallback.

    Parameters
    ----------
    sch : str | os.PathLike
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    with open(sch, "rb") as handle:
        sample = handle.read(100_000)  # first 100 KB is enough for detection
    detected = chardet.detect(sample)
    # detected looks like {'encoding': 'Windows-1252', 'confidence': 0.73, ...}
    encoding = detected["encoding"] or "utf-8"
    return pd.read_csv(sch, encoding=encoding)
|
| 1689 |
+
|
| 1690 |
+
def mainRun(schedule, plan, searcharray, sch_csv_pdf):
|
| 1691 |
+
if sch_csv_pdf:
|
| 1692 |
+
print("shcedule type is PDF")
|
| 1693 |
+
no_tables_normal, not_found_any_plan_normal = pick_approach(schedule, plan, searcharray, 1)
|
| 1694 |
+
no_tables_model, not_found_any_plan_model = pick_approach(schedule, plan, searcharray, 2)
|
| 1695 |
+
pick_normal = False
|
| 1696 |
+
pick_model = False
|
| 1697 |
+
if no_tables_model:
|
|
|
|
|
|
|
| 1698 |
pick_normal = True
|
| 1699 |
+
#print("choose normal")
|
| 1700 |
+
elif no_tables_normal:
|
| 1701 |
pick_model = True
|
| 1702 |
+
#print("choose model")
|
| 1703 |
+
elif no_tables_model and no_tables_normal:
|
| 1704 |
+
print("el etneen bayzeen")
|
| 1705 |
+
else:
|
| 1706 |
+
## Decide according to the not found labels
|
| 1707 |
+
#print("el etneen shaghaleen")
|
| 1708 |
+
if len(not_found_any_plan_model) > len(not_found_any_plan_normal):
|
| 1709 |
+
#print("choose not_found_any_plan_normal")
|
| 1710 |
+
pick_normal = True
|
| 1711 |
+
elif len(not_found_any_plan_model) < len(not_found_any_plan_normal):
|
| 1712 |
+
pick_model = True
|
| 1713 |
+
#print("choose not_found_any_plan_model")
|
| 1714 |
+
else: # law ad ba3d choose the older approach (fitz)
|
| 1715 |
+
pick_normal = True
|
| 1716 |
+
#print("choose any")
|
| 1717 |
+
|
| 1718 |
+
else:
|
| 1719 |
+
print("schedule type is CSV")
|
| 1720 |
+
df = get_df_csv(schedule[0])
|
| 1721 |
+
print(df)
|
| 1722 |
+
print("mainRun is RUNNING")
|
| 1723 |
|
| 1724 |
#print(type(plan))
|
| 1725 |
eltype = type(plan)
|
|
|
|
| 1759 |
#width_plan = math.ceil(width_plan)
|
| 1760 |
#height_plan = math.ceil(height_plan)
|
| 1761 |
for k in range(len(schedule)):
|
| 1762 |
+
if sch_csv_pdf and pick_normal:
|
| 1763 |
dfs = extract_tables(schedule[k])
|
| 1764 |
+
if sch_csv_pdf and pick_model:
|
| 1765 |
dfs = extract_tables_model(schedule[k])
|
| 1766 |
+
|
| 1767 |
+
if sch_csv_pdf == False:
|
| 1768 |
+
df = get_df_csv(schedule[k])
|
| 1769 |
+
dfs = [df]
|
| 1770 |
user_input_this_schedule = searcharray[k]
|
| 1771 |
for j in range(len(user_input_this_schedule)):
|
| 1772 |
user_input = user_input_this_schedule[j]
|
|
|
|
| 1786 |
print("mafeesh secondary information")
|
| 1787 |
|
| 1788 |
selected_columns_combined = get_selected_columns_all(dfs, user_input)
|
| 1789 |
+
if sch_csv_pdf:
|
| 1790 |
+
if selected_columns_combined is None:
|
| 1791 |
+
dfs_normal = extract_tables(schedule[k])
|
| 1792 |
+
column_indices = get_column_indices_from_dfs_normal(dfs_normal, user_input)
|
| 1793 |
+
if column_indices is None:
|
| 1794 |
+
missing_clmns = check_missing(dfs, user_input)
|
| 1795 |
+
missing_message = f"{missing_clmns} can't be extracted from table input {j+1} in schedule {k+1}"
|
| 1796 |
+
missings.append(missing_message)
|
| 1797 |
+
|
| 1798 |
+
continue # continue to the next user input
|
| 1799 |
+
if len(dfs) == 1:
|
| 1800 |
+
selected_columns_combined = get_selected_columns_by_index(dfs[0], column_indices, user_input)
|
| 1801 |
+
if len(dfs) > 1:
|
| 1802 |
+
index_df = get_df_index(dfs, user_input)
|
| 1803 |
+
selected_columns_combined = get_selected_columns_by_index(dfs[index_df], column_indices, user_input)
|
| 1804 |
selected_columns_combined = selected_columns_combined.applymap(lambda x: 'N/A' if isinstance(x, str) and x.strip() == '' else x)
|
| 1805 |
selected_columns_combined = selected_columns_combined.fillna('N/A')
|
| 1806 |
selected_columns_combined = selected_columns_combined.replace(r'(?i)\bn/a\b', 'N/A', regex=True)
|
|
|
|
| 1831 |
df_points = grouped_to_dataframe_dynamic(col_dict,
|
| 1832 |
drop_empty_locations=True,
|
| 1833 |
explode_locations=True)
|
| 1834 |
+
df_points.columns = df_points.columns.str.strip().str.replace(r"\s+", "_", regex=True)
|
| 1835 |
+
|
| 1836 |
# handling no door type in the new dictionary logic
|
| 1837 |
if 'color' not in df_points:
|
| 1838 |
df_points['color'] = (0, 0, 255)
|
|
|
|
| 1840 |
repeated_ids = dupes[dupes > 1].index.to_list()
|
| 1841 |
repeated_labels_list.append(repeated_ids)
|
| 1842 |
|
| 1843 |
+
if ('width' in df_points and 'height' in df_points) or 'structural_opening' in df_points:
|
| 1844 |
+
if kelma:
|
| 1845 |
+
lst_st_op = df_points["structural_opening"].tolist()
|
| 1846 |
+
cleaned_st_op = get_cleaned_width(lst_st_op)
|
| 1847 |
+
widths, heights = get_widths_bb_format_st_op(cleaned_st_op, kelma)
|
| 1848 |
+
# remove a column (returns a new df)
|
| 1849 |
+
df_points = df_points.drop(columns=['structural_opening'])
|
| 1850 |
+
|
| 1851 |
+
# add two columns (scalars, lists/arrays/Series of length len(df), or expressions)
|
| 1852 |
+
df_points['width'] = widths # e.g., a list/Series/np.array or a scalar
|
| 1853 |
+
df_points['height'] = heights
|
| 1854 |
+
else:
|
| 1855 |
+
lst_width = df_points["width"].tolist()
|
| 1856 |
+
lst_height = df_points["height"].tolist()
|
| 1857 |
+
clean_widths, clean_height = get_width_clean_width_height(lst_width, lst_height)
|
| 1858 |
+
df_points["width"] = clean_widths
|
| 1859 |
+
df_points["height"] = clean_height
|
| 1860 |
+
df_points = df_points.rename(columns={'width': 'Width_', 'height':'Height_'})
|
| 1861 |
+
|
| 1862 |
#if kelma == None:
|
| 1863 |
#widths, secondary_tobeprinted = get_width_info_tobeprinted_secondary(new_data3, main_info, secondary_info)
|
| 1864 |
#else:
|