Spaces:

XPMaster
/

data_automation

Sleeping

App Files Files Community

XPMaster committed on Sep 12, 2023

Commit

a35ed6e

1 Parent(s): c4a35dc

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -15

app.py CHANGED Viewed

@@ -198,10 +198,14 @@ def fill_missing_quarters(df, lob, acc, transaction):
     print('Number of NaN values in', transaction, ':', df[transaction].isna().sum())
     for col in columns_to_convert:
       df[col] = df[col].apply(lambda x: str(int(x)) if isinstance(x, (int, float)) and str(x) != 'nan' else str(x))
     quarters = []
     start_year = 2017
     end_year = min(int(df[acc].max()[:4]), 2025)
     for year in range(start_year, end_year+1):
         for quarter in ['03', '06', '09', '12']:
             quarters.append(str(year) + quarter)
@@ -213,18 +217,21 @@ def fill_missing_quarters(df, lob, acc, transaction):
         l_missing_df = pd.DataFrame({acc: list(l_quarters),
                                       transaction: [str(end_year)+'12'] * len(l_quarters)})
         for col in df.columns: # Fill the missing
             if col != lob: # These two checks are necessary in case we are filling for the premium; then we only fill it with the missing quarters, without the 202212 for transactions
                 if col == acc:
                     l_missing_df[col] = list(l_quarters)
-                elif col == transaction:
-                    l_missing_df[col] = ['202212'] * len(l_quarters)
                 else:
                     l_missing_df[col] = 0.1
         if len(l_quarters) > 0 :
-          print(l,'was filled with the dates',l_quarters)
         l_missing_df[lob] = l
         missing_quarters.append(l_missing_df)
     print('Unique values in', acc, 'for missing quarters:', l_missing_df[acc].unique())
     # Concatenate the original dataframe and the missing quarters dataframe
     filled_df = pd.concat([df] + missing_quarters, ignore_index=True)
@@ -235,15 +242,15 @@ def fill_missing_quarters(df, lob, acc, transaction):
     filled_df[acc] = pd.to_datetime(filled_df[acc], format='%Y%m').dt.strftime('%Y%m')
     print('Unique values in', acc, 'after conversion:', filled_df[acc].unique())
     # Sort the dataframe by quarter
     filled_df = filled_df.sort_values(acc)
     # Reset the index
     filled_df = filled_df.reset_index(drop=True)
     # Print the filled quarters or a message if there are no missing quarters
     filled_quarters = filled_df[acc].unique()
-    filtered_quarters = [q for q in filled_quarters if q[:4] in [str(year1) for year1 in range(start_year, end_year + 1)]]
-    if len(filtered_quarters) == 0:
         print("No missing quarters between 2017-2022")
     else:
         pass#print(filtered_quarters)
@@ -252,9 +259,13 @@ def fill_missing_quarters(df, lob, acc, transaction):
     return filled_df
 def drop_missing_rows(df, columns):
     """Split ``df`` into complete rows and rows with missing key values.

     Args:
         df: DataFrame to filter.
         columns: List of column labels that must all be non-null for a row
             to be kept. ``columns[0]`` is additionally required to be
             non-null for a row to be reported in ``removed_rows``.

     Returns:
         A ``(kept, removed_rows)`` tuple. ``kept`` holds the rows with no
         null in any of ``columns``. ``removed_rows`` holds the rows that
         have at least one null in ``columns`` but a non-null value in
         ``columns[0]``; rows whose ``columns[0]`` is null appear in neither.
     """
     # Compute the "any key column is null" mask once and reuse it for both
     # halves of the split.
     has_missing = df[columns].isnull().any(axis=1)
     # Pass the label as a one-element list: a bare label for ``subset`` is
     # only accepted by newer pandas releases.
     removed_rows = df[has_missing].dropna(subset=[columns[0]], how='any')
     df = df[~has_missing]
     return df, removed_rows
@@ -296,7 +307,11 @@ def get_alts(atype):
     return ['lob','accident_quarter_bracket','transaction_quarter_bracket','paid_amount','gross_recoveries_settled','os_amount','gross_os_recoveries','claim_count']
   return ['lob','quarter_bracket','gross_premium_earned','ERP']
-def filter_claims(df):
   global warnings
   warnings = []
   columns = []
@@ -307,11 +322,22 @@ def filter_claims(df):
     return None,None
   # Find quarters
   sublist = quarters(df)
   columns.extend(sublist)
-  min_col = min(sublist, key=lambda col: df.dropna()[col].sum())
-  max_col = max(sublist, key=lambda col: df.dropna()[col].sum())
   df,temp = drop_missing_rows(df,columns)
   df = fill_missing_quarters(df,columns[0],min_col,max_col)
   df = col_to_ints(df,sublist)
   #df = df[[min_col, max_col] + [col for col in df.columns if col not in [min_col, max_col]]]
   #display(df)
@@ -320,7 +346,7 @@ def filter_claims(df):
   # Rearrange the columns list
   if min_col_index > max_col_index:
       columns.insert(max_col_index, columns.pop(min_col_index))
   is_found(columns,"quarters")
   # Find paid amount
   columns.append(get_paid_amount(df))
@@ -340,10 +366,12 @@ def filter_claims(df):
   # Warn
   for i,w in enumerate(warnings):
     print(str(i+1)+'-',w)
-  df = pd.concat([df, temp], ignore_index=True)
   df = df.replace('nan',0)
   df = df.fillna({col: 0 for col in df.columns if col not in sublist})
-  return df,columns
 def filter_premiums(df):
   global warnings
@@ -433,13 +461,13 @@ def process(files,button):
         return None, msg
     names = unzip_files(files.name)
     sheet_data = dict()
     for name in names:
         #name = os.path.basename(name)
         if valid(name):
                 # return zip_files([files.name]),'Success'+passe
             columns = []
             replacens = dict()
             print("Processing:", name)
@@ -455,6 +483,7 @@ def process(files,button):
             print(old_olds)
             if "summ" in name:
                 df,columns = filter_premiums(df)
                 if columns == None:
                     print(name,'has no LOB column')
@@ -463,7 +492,8 @@ def process(files,button):
                     continue
                 altnames = get_alts('summ')
             else:
-                df,columns = filter_claims(df)
                 if columns == None:
                     print(name,'has no LOB column')
                     print("--"*50)
@@ -483,6 +513,12 @@ def process(files,button):
             df, msg = map_names(df,name)
             df = df[columns]
             column_mapping = dict(zip(columns, finalnames))
             df = df.rename(columns=column_mapping)

     print('Number of NaN values in', transaction, ':', df[transaction].isna().sum())
     for col in columns_to_convert:
       df[col] = df[col].apply(lambda x: str(int(x)) if isinstance(x, (int, float)) and str(x) != 'nan' else str(x))
     quarters = []
     start_year = 2017
+    # df_temp = df.copy(deep=True)
+    # df_temp = df_temp.dropna()
     end_year = min(int(df[acc].max()[:4]), 2025)
+    print("the end year", end_year)
+    print("safe and sound")
     for year in range(start_year, end_year+1):
         for quarter in ['03', '06', '09', '12']:
             quarters.append(str(year) + quarter)
         l_missing_df = pd.DataFrame({acc: list(l_quarters),
                                       transaction: [str(end_year)+'12'] * len(l_quarters)})
         for col in df.columns: # Fill the missing
+            #print("\n"*5,col,transaction)
             if col != lob: # These two checks are necessary in case we are filling for the premium; then we only fill it with the missing quarters, without the 202212 for transactions
                 if col == acc:
                     l_missing_df[col] = list(l_quarters)
+                elif str(col) == str(transaction):
+                    l_missing_df[col] = [str(end_year) + '12'] * len(l_quarters)
                 else:
                     l_missing_df[col] = 0.1
         if len(l_quarters) > 0 :
+            print(l,'was filled with the dates',l_quarters)
         l_missing_df[lob] = l
         missing_quarters.append(l_missing_df)
+    print("=="*100)
     print('Unique values in', acc, 'for missing quarters:', l_missing_df[acc].unique())
     # Concatenate the original dataframe and the missing quarters dataframe
     filled_df = pd.concat([df] + missing_quarters, ignore_index=True)
     filled_df[acc] = pd.to_datetime(filled_df[acc], format='%Y%m').dt.strftime('%Y%m')
     print('Unique values in', acc, 'after conversion:', filled_df[acc].unique())
+    print("=="*100)
     # Sort the dataframe by quarter
     filled_df = filled_df.sort_values(acc)
     # Reset the index
     filled_df = filled_df.reset_index(drop=True)
     # Print the filled quarters or a message if there are no missing quarters
     filled_quarters = filled_df[acc].unique()
+    #filtered_quarters = [q for q in filled_quarters if q[:4] in [str(year1) for year1 in range(start_year, end_year + 1)]]
+    if False:#len(filtered_quarters) == 0:
         print("No missing quarters between 2017-2022")
     else:
         pass#print(filtered_quarters)
     return filled_df
 def drop_missing_rows(df, columns):
     """Split ``df`` into complete rows and rows with missing key values.

     Args:
         df: DataFrame to filter.
         columns: List of column labels that must all be non-null for a row
             to be kept. ``columns[0]`` is printed as a diagnostic and is
             additionally required to be non-null for a row to be reported
             in ``removed_rows``.

     Returns:
         A ``(kept, removed_rows)`` tuple. ``kept`` holds the rows with no
         null in any of ``columns``. ``removed_rows`` holds the rows that
         have at least one null in ``columns`` but a non-null value in
         ``columns[0]``; rows whose ``columns[0]`` is null appear in neither.
     """
     # Compute the "any key column is null" mask once; the earlier version
     # rebuilt it three times and re-filtered an already filtered frame.
     has_missing = df[columns].isnull().any(axis=1)
     # Diagnostic output kept from the original implementation.
     print("LOB NAME", columns[0])
     # Pass the label as a one-element list: a bare label for ``subset`` is
     # only accepted by newer pandas releases.
     removed_rows = df[has_missing].dropna(subset=[columns[0]], how='any')
     df = df[~has_missing]
     return df, removed_rows
     return ['lob','accident_quarter_bracket','transaction_quarter_bracket','paid_amount','gross_recoveries_settled','os_amount','gross_os_recoveries','claim_count']
   return ['lob','quarter_bracket','gross_premium_earned','ERP']
+def filter_claims(df):
+  print("Sum of Null beginning: ",df.isnull().sum())
+  print("Sum of Null beginning 2: ",(df == '').sum())
+  print(df.dtypes)
   global warnings
   warnings = []
   columns = []
     return None,None
   # Find quarters
   sublist = quarters(df)
+  print("\n"*10,sublist,"\n"*10)
   columns.extend(sublist)
+  # min_col = min(sublist, key=lambda col: df.dropna()[col].sum())
+  # max_col = max(sublist, key=lambda col: df.dropna()[col].sum())
+  min_col = df[sublist].sum().idxmin()
+  max_col = [col for col in sublist if col != min_col][0]
   df,temp = drop_missing_rows(df,columns)
+  print('missing: ',df[df.columns[1]].isnull().sum())
+  #df.to_csv("gayassshit.csv")
+  #temp.to_csv("gayassshit1.csv")
+  #df.to_csv("before_filling.csv")
+  #print("\n"*10,columns[0],min_col,max_col,"\n"*10)
   df = fill_missing_quarters(df,columns[0],min_col,max_col)
+  #df.to_csv("after_filling.csv")
+  #print(columns[0],min_col,max_col)
+  #temp = fill_missing_quarters(temp,columns[0],min_col,max_col)
   df = col_to_ints(df,sublist)
   #df = df[[min_col, max_col] + [col for col in df.columns if col not in [min_col, max_col]]]
   #display(df)
   # Rearrange the columns list
   if min_col_index > max_col_index:
       columns.insert(max_col_index, columns.pop(min_col_index))
   is_found(columns,"quarters")
   # Find paid amount
   columns.append(get_paid_amount(df))
   # Warn
   for i,w in enumerate(warnings):
     print(str(i+1)+'-',w)
+  #df = pd.concat([df, temp], ignore_index=True)
   df = df.replace('nan',0)
   df = df.fillna({col: 0 for col in df.columns if col not in sublist})
+  return df,columns,temp
 def filter_premiums(df):
   global warnings
         return None, msg
     names = unzip_files(files.name)
     sheet_data = dict()
     for name in names:
         #name = os.path.basename(name)
         if valid(name):
                 # return zip_files([files.name]),'Success'+passe
+            temp = None
             columns = []
             replacens = dict()
             print("Processing:", name)
             print(old_olds)
             if "summ" in name:
+                print("Summary:")
                 df,columns = filter_premiums(df)
                 if columns == None:
                     print(name,'has no LOB column')
                     continue
                 altnames = get_alts('summ')
             else:
+                print("Claims:")
+                df,columns,temp = filter_claims(df)
                 if columns == None:
                     print(name,'has no LOB column')
                     print("--"*50)
             df, msg = map_names(df,name)
             df = df[columns]
+            print("temp",temp)
+            if isinstance(temp,pd.DataFrame):
+                temp, _ = map_names(temp,name)
+                temp = temp[columns]
+                temp = temp[temp.iloc[:, 3:].sum(axis=1) != 0]
+                df = pd.concat([df, temp], ignore_index=True)
             column_mapping = dict(zip(columns, finalnames))
             df = df.rename(columns=column_mapping)