Marthee committed on
Commit
bfb30ff
·
verified ·
1 Parent(s): 03568ae

Update Doors_Schedule.py

Browse files
Files changed (1) hide show
  1. Doors_Schedule.py +92 -60
Doors_Schedule.py CHANGED
@@ -32,6 +32,8 @@ from collections import defaultdict
32
  from xml.etree.ElementTree import Element, SubElement, tostring
33
  from azure.ai.formrecognizer import DocumentAnalysisClient
34
  from azure.core.credentials import AzureKeyCredential
 
 
35
 
36
  def convert2img(path):
37
  pdf = pdfium.PdfDocument(path)
@@ -110,12 +112,16 @@ def flexible_search(df, search_terms):
110
  return results
111
 
112
 
113
- def generate_current_table_without_cropping(clm_idx, clmn_name, df):
114
  selected_df = df.iloc[:, clm_idx]
115
  print("hello I generated the selected columns table without cropping")
116
  selected_df.columns = clmn_name
117
- return selected_df
118
 
 
 
 
 
119
 
120
  def crop_rename_table(indices, clmn_name, clmn_idx,df):
121
  #crop_at = (max(set(indices), key=indices.count)) + 1
@@ -496,6 +502,8 @@ def get_selected_columns_all(dfs, user_patterns):
496
  #details in the same table
497
  if len(dfs[i]) >10:
498
  selected_columns_new = generate_current_table_without_cropping(column_index_list,dfs[i])
 
 
499
  #break
500
 
501
  #IN CELLS
@@ -1670,32 +1678,48 @@ def pick_approach(schedule, plan, searcharray, flag):
1670
 
1671
  return no_tables, not_found_any_plan
1672
 
1673
- def mainRun(schedule, plan, searcharray):
1674
- print("mainRun is RUNNING")
1675
- no_tables_normal, not_found_any_plan_normal = pick_approach(schedule, plan, searcharray, 1)
1676
- no_tables_model, not_found_any_plan_model = pick_approach(schedule, plan, searcharray, 2)
1677
- pick_normal = False
1678
- pick_model = False
1679
- if no_tables_model:
1680
- pick_normal = True
1681
- #print("choose normal")
1682
- elif no_tables_normal:
1683
- pick_model = True
1684
- #print("choose model")
1685
- elif no_tables_model and no_tables_normal:
1686
- print("el etneen bayzeen")
1687
- else:
1688
- ## Decide according to the not found labels
1689
- #print("el etneen shaghaleen")
1690
- if len(not_found_any_plan_model) > len(not_found_any_plan_normal):
1691
- #print("choose not_found_any_plan_normal")
1692
  pick_normal = True
1693
- elif len(not_found_any_plan_model) < len(not_found_any_plan_normal):
 
1694
  pick_model = True
1695
- #print("choose not_found_any_plan_model")
1696
- else: # law ad ba3d choose the older approach (fitz)
1697
- pick_normal = True
1698
- #print("choose any")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1699
 
1700
  #print(type(plan))
1701
  eltype = type(plan)
@@ -1735,10 +1759,14 @@ def mainRun(schedule, plan, searcharray):
1735
  #width_plan = math.ceil(width_plan)
1736
  #height_plan = math.ceil(height_plan)
1737
  for k in range(len(schedule)):
1738
- if pick_normal:
1739
  dfs = extract_tables(schedule[k])
1740
- if pick_model:
1741
  dfs = extract_tables_model(schedule[k])
 
 
 
 
1742
  user_input_this_schedule = searcharray[k]
1743
  for j in range(len(user_input_this_schedule)):
1744
  user_input = user_input_this_schedule[j]
@@ -1758,20 +1786,21 @@ def mainRun(schedule, plan, searcharray):
1758
  print("mafeesh secondary information")
1759
 
1760
  selected_columns_combined = get_selected_columns_all(dfs, user_input)
1761
- if selected_columns_combined is None:
1762
- dfs_normal = extract_tables(schedule[k])
1763
- column_indices = get_column_indices_from_dfs_normal(dfs_normal, user_input)
1764
- if column_indices is None:
1765
- missing_clmns = check_missing(dfs, user_input)
1766
- missing_message = f"{missing_clmns} can't be extracted from table input {j+1} in schedule {k+1}"
1767
- missings.append(missing_message)
1768
-
1769
- continue # continue to the next user input
1770
- if len(dfs) == 1:
1771
- selected_columns_combined = get_selected_columns_by_index(dfs[0], column_indices, user_input)
1772
- if len(dfs) > 1:
1773
- index_df = get_df_index(dfs, user_input)
1774
- selected_columns_combined = get_selected_columns_by_index(dfs[index_df], column_indices, user_input)
 
1775
  selected_columns_combined = selected_columns_combined.applymap(lambda x: 'N/A' if isinstance(x, str) and x.strip() == '' else x)
1776
  selected_columns_combined = selected_columns_combined.fillna('N/A')
1777
  selected_columns_combined = selected_columns_combined.replace(r'(?i)\bn/a\b', 'N/A', regex=True)
@@ -1802,6 +1831,8 @@ def mainRun(schedule, plan, searcharray):
1802
  df_points = grouped_to_dataframe_dynamic(col_dict,
1803
  drop_empty_locations=True,
1804
  explode_locations=True)
 
 
1805
  # handling no door type in the new dictionary logic
1806
  if 'color' not in df_points:
1807
  df_points['color'] = (0, 0, 255)
@@ -1809,24 +1840,25 @@ def mainRun(schedule, plan, searcharray):
1809
  repeated_ids = dupes[dupes > 1].index.to_list()
1810
  repeated_labels_list.append(repeated_ids)
1811
 
1812
- if kelma:
1813
- lst_st_op = df_points["structural_opening"].tolist()
1814
- cleaned_st_op = get_cleaned_width(lst_st_op)
1815
- widths, heights = get_widths_bb_format_st_op(cleaned_st_op, kelma)
1816
- # remove a column (returns a new df)
1817
- df_points = df_points.drop(columns=['structural_opening'])
1818
-
1819
- # add two columns (scalars, lists/arrays/Series of length len(df), or expressions)
1820
- df_points['width'] = widths # e.g., a list/Series/np.array or a scalar
1821
- df_points['height'] = heights
1822
- else:
1823
- lst_width = df_points["width"].tolist()
1824
- lst_height = df_points["height"].tolist()
1825
- clean_widths, clean_height = get_width_clean_width_height(lst_width, lst_height)
1826
- df_points["width"] = clean_widths
1827
- df_points["height"] = clean_height
1828
- df_points = df_points.rename(columns={'width': 'Width_', 'height':'Height_'})
1829
-
 
1830
  #if kelma == None:
1831
  #widths, secondary_tobeprinted = get_width_info_tobeprinted_secondary(new_data3, main_info, secondary_info)
1832
  #else:
 
32
  from xml.etree.ElementTree import Element, SubElement, tostring
33
  from azure.ai.formrecognizer import DocumentAnalysisClient
34
  from azure.core.credentials import AzureKeyCredential
35
+ import chardet
36
+
37
 
38
  def convert2img(path):
39
  pdf = pdfium.PdfDocument(path)
 
112
  return results
113
 
114
 
115
+ """def generate_current_table_without_cropping(clm_idx, clmn_name, df):
116
  selected_df = df.iloc[:, clm_idx]
117
  print("hello I generated the selected columns table without cropping")
118
  selected_df.columns = clmn_name
119
+ return selected_df"""
120
 
121
+ def generate_current_table_without_cropping(clm_idx,df):
122
+ selected_df = df.iloc[:, clm_idx]
123
+ print("hello I generated the selected columns table without cropping")
124
+ return selected_df
125
 
126
  def crop_rename_table(indices, clmn_name, clmn_idx,df):
127
  #crop_at = (max(set(indices), key=indices.count)) + 1
 
502
  #details in the same table
503
  if len(dfs[i]) >10:
504
  selected_columns_new = generate_current_table_without_cropping(column_index_list,dfs[i])
505
+ selected_columns_new.columns = clmn_name # must match number of columns
506
+
507
  #break
508
 
509
  #IN CELLS
 
1678
 
1679
  return no_tables, not_found_any_plan
1680
 
1681
def get_df_csv(sch):
    """Load a CSV schedule into a DataFrame, auto-detecting its encoding.

    Parameters
    ----------
    sch : str | os.PathLike
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # Sniff only the first 100 KB -- plenty for detection and keeps it fast.
    with open(sch, "rb") as f:
        raw = f.read(100_000)
    guess = chardet.detect(raw)
    # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73, ...};
    # chardet may return None for the encoding, so fall back to utf-8.
    encoding = guess["encoding"] or "utf-8"
    try:
        return pd.read_csv(sch, encoding=encoding)
    except (UnicodeDecodeError, LookupError):
        # Detection can be wrong on short/ambiguous samples; latin-1 maps
        # every byte to a codepoint, so this read cannot fail on decoding.
        return pd.read_csv(sch, encoding="latin-1")
1689
+
1690
+ def mainRun(schedule, plan, searcharray, sch_csv_pdf):
1691
+ if sch_csv_pdf:
1692
+ print("shcedule type is PDF")
1693
+ no_tables_normal, not_found_any_plan_normal = pick_approach(schedule, plan, searcharray, 1)
1694
+ no_tables_model, not_found_any_plan_model = pick_approach(schedule, plan, searcharray, 2)
1695
+ pick_normal = False
1696
+ pick_model = False
1697
+ if no_tables_model:
 
 
1698
  pick_normal = True
1699
+ #print("choose normal")
1700
+ elif no_tables_normal:
1701
  pick_model = True
1702
+ #print("choose model")
1703
+ elif no_tables_model and no_tables_normal:
1704
+ print("el etneen bayzeen")
1705
+ else:
1706
+ ## Decide according to the not found labels
1707
+ #print("el etneen shaghaleen")
1708
+ if len(not_found_any_plan_model) > len(not_found_any_plan_normal):
1709
+ #print("choose not_found_any_plan_normal")
1710
+ pick_normal = True
1711
+ elif len(not_found_any_plan_model) < len(not_found_any_plan_normal):
1712
+ pick_model = True
1713
+ #print("choose not_found_any_plan_model")
1714
+ else: # law ad ba3d choose the older approach (fitz)
1715
+ pick_normal = True
1716
+ #print("choose any")
1717
+
1718
+ else:
1719
+ print("schedule type is CSV")
1720
+ df = get_df_csv(schedule[0])
1721
+ print(df)
1722
+ print("mainRun is RUNNING")
1723
 
1724
  #print(type(plan))
1725
  eltype = type(plan)
 
1759
  #width_plan = math.ceil(width_plan)
1760
  #height_plan = math.ceil(height_plan)
1761
  for k in range(len(schedule)):
1762
+ if sch_csv_pdf and pick_normal:
1763
  dfs = extract_tables(schedule[k])
1764
+ if sch_csv_pdf and pick_model:
1765
  dfs = extract_tables_model(schedule[k])
1766
+
1767
+ if sch_csv_pdf == False:
1768
+ df = get_df_csv(schedule[k])
1769
+ dfs = [df]
1770
  user_input_this_schedule = searcharray[k]
1771
  for j in range(len(user_input_this_schedule)):
1772
  user_input = user_input_this_schedule[j]
 
1786
  print("mafeesh secondary information")
1787
 
1788
  selected_columns_combined = get_selected_columns_all(dfs, user_input)
1789
+ if sch_csv_pdf:
1790
+ if selected_columns_combined is None:
1791
+ dfs_normal = extract_tables(schedule[k])
1792
+ column_indices = get_column_indices_from_dfs_normal(dfs_normal, user_input)
1793
+ if column_indices is None:
1794
+ missing_clmns = check_missing(dfs, user_input)
1795
+ missing_message = f"{missing_clmns} can't be extracted from table input {j+1} in schedule {k+1}"
1796
+ missings.append(missing_message)
1797
+
1798
+ continue # continue to the next user input
1799
+ if len(dfs) == 1:
1800
+ selected_columns_combined = get_selected_columns_by_index(dfs[0], column_indices, user_input)
1801
+ if len(dfs) > 1:
1802
+ index_df = get_df_index(dfs, user_input)
1803
+ selected_columns_combined = get_selected_columns_by_index(dfs[index_df], column_indices, user_input)
1804
  selected_columns_combined = selected_columns_combined.applymap(lambda x: 'N/A' if isinstance(x, str) and x.strip() == '' else x)
1805
  selected_columns_combined = selected_columns_combined.fillna('N/A')
1806
  selected_columns_combined = selected_columns_combined.replace(r'(?i)\bn/a\b', 'N/A', regex=True)
 
1831
  df_points = grouped_to_dataframe_dynamic(col_dict,
1832
  drop_empty_locations=True,
1833
  explode_locations=True)
1834
+ df_points.columns = df_points.columns.str.strip().str.replace(r"\s+", "_", regex=True)
1835
+
1836
  # handling no door type in the new dictionary logic
1837
  if 'color' not in df_points:
1838
  df_points['color'] = (0, 0, 255)
 
1840
  repeated_ids = dupes[dupes > 1].index.to_list()
1841
  repeated_labels_list.append(repeated_ids)
1842
 
1843
+ if ('width' in df_points and 'height' in df_points) or 'structural_opening' in df_points:
1844
+ if kelma:
1845
+ lst_st_op = df_points["structural_opening"].tolist()
1846
+ cleaned_st_op = get_cleaned_width(lst_st_op)
1847
+ widths, heights = get_widths_bb_format_st_op(cleaned_st_op, kelma)
1848
+ # remove a column (returns a new df)
1849
+ df_points = df_points.drop(columns=['structural_opening'])
1850
+
1851
+ # add two columns (scalars, lists/arrays/Series of length len(df), or expressions)
1852
+ df_points['width'] = widths # e.g., a list/Series/np.array or a scalar
1853
+ df_points['height'] = heights
1854
+ else:
1855
+ lst_width = df_points["width"].tolist()
1856
+ lst_height = df_points["height"].tolist()
1857
+ clean_widths, clean_height = get_width_clean_width_height(lst_width, lst_height)
1858
+ df_points["width"] = clean_widths
1859
+ df_points["height"] = clean_height
1860
+ df_points = df_points.rename(columns={'width': 'Width_', 'height':'Height_'})
1861
+
1862
  #if kelma == None:
1863
  #widths, secondary_tobeprinted = get_width_info_tobeprinted_secondary(new_data3, main_info, secondary_info)
1864
  #else: