Marthee committed on
Commit
bfb30ff
·
verified ·
1 Parent(s): 03568ae

Update Doors_Schedule.py

Browse files
Files changed (1) hide show
  1. Doors_Schedule.py +92 -60
Doors_Schedule.py CHANGED
@@ -32,6 +32,8 @@ from collections import defaultdict
32
  from xml.etree.ElementTree import Element, SubElement, tostring
33
  from azure.ai.formrecognizer import DocumentAnalysisClient
34
  from azure.core.credentials import AzureKeyCredential
 
 
35
 
36
  def convert2img(path):
37
  pdf = pdfium.PdfDocument(path)
@@ -110,12 +112,16 @@ def flexible_search(df, search_terms):
110
  return results
111
 
112
 
113
- def generate_current_table_without_cropping(clm_idx, clmn_name, df):
114
  selected_df = df.iloc[:, clm_idx]
115
  print("hello I generated the selected columns table without cropping")
116
  selected_df.columns = clmn_name
117
- return selected_df
118
 
 
 
 
 
119
 
120
  def crop_rename_table(indices, clmn_name, clmn_idx,df):
121
  #crop_at = (max(set(indices), key=indices.count)) + 1
@@ -496,6 +502,8 @@ def get_selected_columns_all(dfs, user_patterns):
496
  #details in the same table
497
  if len(dfs[i]) >10:
498
  selected_columns_new = generate_current_table_without_cropping(column_index_list,dfs[i])
 
 
499
  #break
500
 
501
  #IN CELLS
@@ -1670,32 +1678,48 @@ def pick_approach(schedule, plan, searcharray, flag):
1670
 
1671
  return no_tables, not_found_any_plan
1672
 
1673
- def mainRun(schedule, plan, searcharray):
1674
- print("mainRun is RUNNING")
1675
- no_tables_normal, not_found_any_plan_normal = pick_approach(schedule, plan, searcharray, 1)
1676
- no_tables_model, not_found_any_plan_model = pick_approach(schedule, plan, searcharray, 2)
1677
- pick_normal = False
1678
- pick_model = False
1679
- if no_tables_model:
1680
- pick_normal = True
1681
- #print("choose normal")
1682
- elif no_tables_normal:
1683
- pick_model = True
1684
- #print("choose model")
1685
- elif no_tables_model and no_tables_normal:
1686
- print("el etneen bayzeen")
1687
- else:
1688
- ## Decide according to the not found labels
1689
- #print("el etneen shaghaleen")
1690
- if len(not_found_any_plan_model) > len(not_found_any_plan_normal):
1691
- #print("choose not_found_any_plan_normal")
1692
  pick_normal = True
1693
- elif len(not_found_any_plan_model) < len(not_found_any_plan_normal):
 
1694
  pick_model = True
1695
- #print("choose not_found_any_plan_model")
1696
- else: # law ad ba3d choose the older approach (fitz)
1697
- pick_normal = True
1698
- #print("choose any")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1699
 
1700
  #print(type(plan))
1701
  eltype = type(plan)
@@ -1735,10 +1759,14 @@ def mainRun(schedule, plan, searcharray):
1735
  #width_plan = math.ceil(width_plan)
1736
  #height_plan = math.ceil(height_plan)
1737
  for k in range(len(schedule)):
1738
- if pick_normal:
1739
  dfs = extract_tables(schedule[k])
1740
- if pick_model:
1741
  dfs = extract_tables_model(schedule[k])
 
 
 
 
1742
  user_input_this_schedule = searcharray[k]
1743
  for j in range(len(user_input_this_schedule)):
1744
  user_input = user_input_this_schedule[j]
@@ -1758,20 +1786,21 @@ def mainRun(schedule, plan, searcharray):
1758
  print("mafeesh secondary information")
1759
 
1760
  selected_columns_combined = get_selected_columns_all(dfs, user_input)
1761
- if selected_columns_combined is None:
1762
- dfs_normal = extract_tables(schedule[k])
1763
- column_indices = get_column_indices_from_dfs_normal(dfs_normal, user_input)
1764
- if column_indices is None:
1765
- missing_clmns = check_missing(dfs, user_input)
1766
- missing_message = f"{missing_clmns} can't be extracted from table input {j+1} in schedule {k+1}"
1767
- missings.append(missing_message)
1768
-
1769
- continue # continue to the next user input
1770
- if len(dfs) == 1:
1771
- selected_columns_combined = get_selected_columns_by_index(dfs[0], column_indices, user_input)
1772
- if len(dfs) > 1:
1773
- index_df = get_df_index(dfs, user_input)
1774
- selected_columns_combined = get_selected_columns_by_index(dfs[index_df], column_indices, user_input)
 
1775
  selected_columns_combined = selected_columns_combined.applymap(lambda x: 'N/A' if isinstance(x, str) and x.strip() == '' else x)
1776
  selected_columns_combined = selected_columns_combined.fillna('N/A')
1777
  selected_columns_combined = selected_columns_combined.replace(r'(?i)\bn/a\b', 'N/A', regex=True)
@@ -1802,6 +1831,8 @@ def mainRun(schedule, plan, searcharray):
1802
  df_points = grouped_to_dataframe_dynamic(col_dict,
1803
  drop_empty_locations=True,
1804
  explode_locations=True)
 
 
1805
  # handling no door type in the new dictionary logic
1806
  if 'color' not in df_points:
1807
  df_points['color'] = (0, 0, 255)
@@ -1809,24 +1840,25 @@ def mainRun(schedule, plan, searcharray):
1809
  repeated_ids = dupes[dupes > 1].index.to_list()
1810
  repeated_labels_list.append(repeated_ids)
1811
 
1812
- if kelma:
1813
- lst_st_op = df_points["structural_opening"].tolist()
1814
- cleaned_st_op = get_cleaned_width(lst_st_op)
1815
- widths, heights = get_widths_bb_format_st_op(cleaned_st_op, kelma)
1816
- # remove a column (returns a new df)
1817
- df_points = df_points.drop(columns=['structural_opening'])
1818
-
1819
- # add two columns (scalars, lists/arrays/Series of length len(df), or expressions)
1820
- df_points['width'] = widths # e.g., a list/Series/np.array or a scalar
1821
- df_points['height'] = heights
1822
- else:
1823
- lst_width = df_points["width"].tolist()
1824
- lst_height = df_points["height"].tolist()
1825
- clean_widths, clean_height = get_width_clean_width_height(lst_width, lst_height)
1826
- df_points["width"] = clean_widths
1827
- df_points["height"] = clean_height
1828
- df_points = df_points.rename(columns={'width': 'Width_', 'height':'Height_'})
1829
-
 
1830
  #if kelma == None:
1831
  #widths, secondary_tobeprinted = get_width_info_tobeprinted_secondary(new_data3, main_info, secondary_info)
1832
  #else:
 
32
  from xml.etree.ElementTree import Element, SubElement, tostring
33
  from azure.ai.formrecognizer import DocumentAnalysisClient
34
  from azure.core.credentials import AzureKeyCredential
35
+ import chardet
36
+
37
 
38
  def convert2img(path):
39
  pdf = pdfium.PdfDocument(path)
 
112
  return results
113
 
114
 
115
+ """def generate_current_table_without_cropping(clm_idx, clmn_name, df):
116
  selected_df = df.iloc[:, clm_idx]
117
  print("hello I generated the selected columns table without cropping")
118
  selected_df.columns = clmn_name
119
+ return selected_df"""
120
 
121
+ def generate_current_table_without_cropping(clm_idx,df):
122
+ selected_df = df.iloc[:, clm_idx]
123
+ print("hello I generated the selected columns table without cropping")
124
+ return selected_df
125
 
126
  def crop_rename_table(indices, clmn_name, clmn_idx,df):
127
  #crop_at = (max(set(indices), key=indices.count)) + 1
 
502
  #details in the same table
503
  if len(dfs[i]) >10:
504
  selected_columns_new = generate_current_table_without_cropping(column_index_list,dfs[i])
505
+ selected_columns_new.columns = clmn_name # must match number of columns
506
+
507
  #break
508
 
509
  #IN CELLS
 
1678
 
1679
  return no_tables, not_found_any_plan
1680
 
1681
def get_df_csv(sch):
    """Load a CSV schedule into a DataFrame, auto-detecting its encoding.

    Parameters
    ----------
    sch : str | os.PathLike
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # Sniff only the first 100 KB -- plenty for detection and keeps it fast.
    with open(sch, "rb") as f:
        raw = f.read(100_000)
    guess = chardet.detect(raw)
    # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73, ...};
    # chardet may return None for the encoding, so fall back to utf-8.
    encoding = guess["encoding"] or "utf-8"
    try:
        return pd.read_csv(sch, encoding=encoding)
    except (UnicodeDecodeError, LookupError):
        # Detection can be wrong on short/ambiguous samples; latin-1 maps
        # every byte to a codepoint, so this read cannot fail on decoding.
        return pd.read_csv(sch, encoding="latin-1")
1689
+
1690
+ def mainRun(schedule, plan, searcharray, sch_csv_pdf):
1691
+ if sch_csv_pdf:
1692
+ print("shcedule type is PDF")
1693
+ no_tables_normal, not_found_any_plan_normal = pick_approach(schedule, plan, searcharray, 1)
1694
+ no_tables_model, not_found_any_plan_model = pick_approach(schedule, plan, searcharray, 2)
1695
+ pick_normal = False
1696
+ pick_model = False
1697
+ if no_tables_model:
 
 
1698
  pick_normal = True
1699
+ #print("choose normal")
1700
+ elif no_tables_normal:
1701
  pick_model = True
1702
+ #print("choose model")
1703
+ elif no_tables_model and no_tables_normal:
1704
+ print("el etneen bayzeen")
1705
+ else:
1706
+ ## Decide according to the not found labels
1707
+ #print("el etneen shaghaleen")
1708
+ if len(not_found_any_plan_model) > len(not_found_any_plan_normal):
1709
+ #print("choose not_found_any_plan_normal")
1710
+ pick_normal = True
1711
+ elif len(not_found_any_plan_model) < len(not_found_any_plan_normal):
1712
+ pick_model = True
1713
+ #print("choose not_found_any_plan_model")
1714
+ else: # law ad ba3d choose the older approach (fitz)
1715
+ pick_normal = True
1716
+ #print("choose any")
1717
+
1718
+ else:
1719
+ print("schedule type is CSV")
1720
+ df = get_df_csv(schedule[0])
1721
+ print(df)
1722
+ print("mainRun is RUNNING")
1723
 
1724
  #print(type(plan))
1725
  eltype = type(plan)
 
1759
  #width_plan = math.ceil(width_plan)
1760
  #height_plan = math.ceil(height_plan)
1761
  for k in range(len(schedule)):
1762
+ if sch_csv_pdf and pick_normal:
1763
  dfs = extract_tables(schedule[k])
1764
+ if sch_csv_pdf and pick_model:
1765
  dfs = extract_tables_model(schedule[k])
1766
+
1767
+ if sch_csv_pdf == False:
1768
+ df = get_df_csv(schedule[k])
1769
+ dfs = [df]
1770
  user_input_this_schedule = searcharray[k]
1771
  for j in range(len(user_input_this_schedule)):
1772
  user_input = user_input_this_schedule[j]
 
1786
  print("mafeesh secondary information")
1787
 
1788
  selected_columns_combined = get_selected_columns_all(dfs, user_input)
1789
+ if sch_csv_pdf:
1790
+ if selected_columns_combined is None:
1791
+ dfs_normal = extract_tables(schedule[k])
1792
+ column_indices = get_column_indices_from_dfs_normal(dfs_normal, user_input)
1793
+ if column_indices is None:
1794
+ missing_clmns = check_missing(dfs, user_input)
1795
+ missing_message = f"{missing_clmns} can't be extracted from table input {j+1} in schedule {k+1}"
1796
+ missings.append(missing_message)
1797
+
1798
+ continue # continue to the next user input
1799
+ if len(dfs) == 1:
1800
+ selected_columns_combined = get_selected_columns_by_index(dfs[0], column_indices, user_input)
1801
+ if len(dfs) > 1:
1802
+ index_df = get_df_index(dfs, user_input)
1803
+ selected_columns_combined = get_selected_columns_by_index(dfs[index_df], column_indices, user_input)
1804
  selected_columns_combined = selected_columns_combined.applymap(lambda x: 'N/A' if isinstance(x, str) and x.strip() == '' else x)
1805
  selected_columns_combined = selected_columns_combined.fillna('N/A')
1806
  selected_columns_combined = selected_columns_combined.replace(r'(?i)\bn/a\b', 'N/A', regex=True)
 
1831
  df_points = grouped_to_dataframe_dynamic(col_dict,
1832
  drop_empty_locations=True,
1833
  explode_locations=True)
1834
+ df_points.columns = df_points.columns.str.strip().str.replace(r"\s+", "_", regex=True)
1835
+
1836
  # handling no door type in the new dictionary logic
1837
  if 'color' not in df_points:
1838
  df_points['color'] = (0, 0, 255)
 
1840
  repeated_ids = dupes[dupes > 1].index.to_list()
1841
  repeated_labels_list.append(repeated_ids)
1842
 
1843
+ if ('width' in df_points and 'height' in df_points) or 'structural_opening' in df_points:
1844
+ if kelma:
1845
+ lst_st_op = df_points["structural_opening"].tolist()
1846
+ cleaned_st_op = get_cleaned_width(lst_st_op)
1847
+ widths, heights = get_widths_bb_format_st_op(cleaned_st_op, kelma)
1848
+ # remove a column (returns a new df)
1849
+ df_points = df_points.drop(columns=['structural_opening'])
1850
+
1851
+ # add two columns (scalars, lists/arrays/Series of length len(df), or expressions)
1852
+ df_points['width'] = widths # e.g., a list/Series/np.array or a scalar
1853
+ df_points['height'] = heights
1854
+ else:
1855
+ lst_width = df_points["width"].tolist()
1856
+ lst_height = df_points["height"].tolist()
1857
+ clean_widths, clean_height = get_width_clean_width_height(lst_width, lst_height)
1858
+ df_points["width"] = clean_widths
1859
+ df_points["height"] = clean_height
1860
+ df_points = df_points.rename(columns={'width': 'Width_', 'height':'Height_'})
1861
+
1862
  #if kelma == None:
1863
  #widths, secondary_tobeprinted = get_width_info_tobeprinted_secondary(new_data3, main_info, secondary_info)
1864
  #else: