Sambit20030731 committed on
Commit
ee296bf
·
verified ·
1 Parent(s): 2bd029d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -86
app.py CHANGED
@@ -1,23 +1,27 @@
 
1
  import pathlib
2
  import textwrap
3
- import tempfile
4
- import gradio as gr
5
  import pandas as pd
6
  import numpy as np
 
 
7
  from fuzzywuzzy import fuzz
8
  from openpyxl import load_workbook
9
  from openpyxl.styles import PatternFill
 
10
  import google.generativeai as genai
11
  from IPython.display import display
12
- from IPython.display import Markdown
13
- from openpyxl.styles.alignment import Alignment
14
 
 
 
15
  GOOGLE_API_KEY='AIzaSyCtACPu9EOnEa1_iAWsv_u__PQRpaCT564'
16
  genai.configure(api_key=GOOGLE_API_KEY)
 
 
 
17
  model = genai.GenerativeModel('gemini-1.0-pro')
18
- def to_markdown(text):
19
- text = text.replace('•', ' *')
20
- return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
21
  # Function to apply to df1 to create the cont_person_name column
22
  def process_fuzzy_ratios(rows_dict):
23
  fuzz_data = {}
@@ -41,11 +45,14 @@ def process_fuzzy_ratios(rows_dict):
41
  "postal_fuzzy_ratio": row.pop("postal_fuzzy_ratio")
42
  }
43
  return fuzz_data, rows_dict
 
 
44
  def gemini_analysis(dataframe):
45
  prev_row_duplicate = False
46
  prev_row_number = None
47
-
48
  for index, row in dataframe.iterrows():
 
 
49
  if row['Remarks'] == 'Duplicate':
50
  if prev_row_duplicate:
51
  duplicate_pairs=[]
@@ -61,6 +68,8 @@ def gemini_analysis(dataframe):
61
  main_data_str = "[{}]".format(', '.join([str(d) for d in duplicate_pairs]))
62
  fuzzy_data_str = "{}".format(fuzzy_ratios)
63
  qs="I have the data",main_data_str,"The corresponding fuzzy ratios are here: ",fuzzy_data_str,"Give a concise explanation why these two rows are duplicate based on analyzing the main data and explaining which column values are same and which column values are different?"
 
 
64
  try:
65
  response = model.generate_content(qs)
66
  dataframe.at[index-1, 'Explanation'] = response.text
@@ -70,6 +79,8 @@ def gemini_analysis(dataframe):
70
  print(f"ValueError occurred: {ve}")
71
  except Exception as ex:
72
  print(f"An error occurred: {ex}")
 
 
73
  dataframe.at[index-1, 'Explanation'] = response.text
74
  prev_row_duplicate = True
75
  prev_row_number = index
@@ -77,75 +88,54 @@ def gemini_analysis(dataframe):
77
  prev_row_duplicate = False
78
  prev_row_number = None
79
 
 
 
80
  def process_csv(file, remove_null_columns):
81
  sheet_name1 = 'General Data '
82
  sheet_name2 = 'Contact Person'
 
 
83
  df = pd.read_excel(file, sheet_name=sheet_name1,engine='openpyxl')
84
  # Replace null values with a blank space
85
  df=df.fillna(" ")
86
- df1 = pd.read_excel(file, sheet_name=sheet_name2)
 
87
  # Replace null values with a blank space
88
  df1 = df1.fillna(" ")
 
89
  # Creating new columns by concatenating original columns
90
  df['Address'] = df['STREET'].astype(str) +'-'+ df['CITY1'].astype(str) +'-'+ df['COUNTRY'].astype(str) + '-' + df['REGION'].astype(str)
91
  df['Name'] = df['NAMEFIRST'].astype(str)+'-'+ df['NAMELAST'].astype(str) +'-'+ df['NAME3'].astype(str) + '-' + df['NAME4'].astype(str)
92
  df['Bank'] = df['BANKL'].astype(str)+'-'+df['BANKN'].astype(str)
93
  df['Tax'] = df['TAXTYPE'].astype(str)+'-'+df['TAXNUM'].astype(str)
94
- df1['cont_person_name'] = df1['PARNR'].astype(str)+'-'+ df1['VNAME'].astype(str) +'-'+ df1['LNAME'].astype(str)
95
- df1['cont_person_address'] = df1['COUNTRY'].astype(str) +'-'+ df1['REGION'].astype(str) +'-'+ df1['POSTLCD'].astype(str) +'-'+ df1['CITY'].astype(str) + '-' + df1['STREET'].astype(str)
96
 
97
  # Converting all concatenated columns to lowercase
98
  df['Name']=df['Name'].str.lower()
99
  df['Address']=df['Address'].str.lower()
100
  df['Bank']=df['Bank'].str.lower()
101
  df['Tax']=df['Tax'].str.lower()
102
- df1['cont_person_name']=df1['cont_person_name'].str.lower()
103
- df1['cont_person_address']=df1['cont_person_address'].str.lower()
104
- #Adding contact_person_name and address to sheet1(General Data)
105
-
106
- # Grouping names in df2 based on LIFNR (ID)
107
- grouped_names = df1.groupby("LIFNR")["cont_person_name"].agg(lambda x: ', '.join(x)).reset_index()
108
-
109
- # Create a dictionary mapping LIFNR to concatenated names
110
- name_map = dict(zip(grouped_names["LIFNR"], grouped_names["cont_person_name"]))
111
- def create_cont_person_name(row):
112
- if row["LIFNR"] in name_map:
113
- return name_map[row["LIFNR"]]
114
- else:
115
- return ""
116
-
117
- grouped_names = df1.groupby("LIFNR")["cont_person_address"].agg(lambda x: ', '.join(x)).reset_index()
118
- add_map = dict(zip(grouped_names["LIFNR"], grouped_names["cont_person_address"]))
119
- def create_cont_person_add(row):
120
- if row["LIFNR"] in add_map:
121
- return add_map[row["LIFNR"]]
122
- else:
123
- return ""
124
-
125
- # Apply the function to create the cont_person_name column
126
- df["cont_person_name"] = df.apply(create_cont_person_name, axis=1)
127
- df["cont_person_address"] = df.apply(create_cont_person_add, axis=1)
128
  df['name_fuzzy_ratio']=''
129
  df['accgrp_fuzzy_ratio']=''
130
  df['address_fuzzy_ratio']=''
131
  df['bank_fuzzy_ratio']=''
132
  df['tax_fuzzy_ratio']=''
133
  df['postal_fuzzy_ratio']=''
134
- df1['cont_person_name_fuzzy_ratio']=''
135
- df1['cont_person_address_fuzzy_ratio']=''
136
 
 
137
  df['name_based_group']=''
138
  df['accgrp_based_group']=''
139
  df['address_based_group']=''
140
  df['bank_based_group']=''
141
  df['tax_based_group']=''
142
  df['postal_based_group']=''
143
- df1['cont_person_name_based_group']=''
144
- df1['cont_person_address_based_group']=''
145
 
 
146
  last_row_index = len(df)-1
147
  last_row_index1 = len(df1)-1
148
 
 
149
  df.sort_values(['Tax'], inplace=True)
150
  df = df.reset_index(drop=True)
151
  df.at[0,'tax_fuzzy_ratio']=100
@@ -155,12 +145,11 @@ def process_csv(file, remove_null_columns):
155
  previous_tax = df['Tax'].iloc[i-1]
156
  fuzzy_ratio = fuzz.ratio(previous_tax,current_tax)
157
  df.at[i,'tax_fuzzy_ratio'] = fuzzy_ratio
158
-
159
  df['tax_fuzzy_ratio'] = pd.to_numeric(df['tax_fuzzy_ratio'], errors='coerce')
160
 
 
161
  group_counter = 1
162
  df.at[0,'tax_based_group'] = group_counter
163
-
164
  for i in range (1, len(df)):
165
  if df.at[i,'tax_fuzzy_ratio'] > 90:
166
  df.at[i,'tax_based_group'] = df.at[i-1,'tax_based_group']
@@ -169,6 +158,7 @@ def process_csv(file, remove_null_columns):
169
  df.at[i,'tax_based_group'] = group_counter
170
  group = df.at[0,'tax_based_group']
171
 
 
172
  df.sort_values(['tax_based_group','Bank'], inplace=True)
173
  df = df.reset_index(drop=True)
174
  df.at[0,'bank_fuzzy_ratio']=100
@@ -178,25 +168,25 @@ def process_csv(file, remove_null_columns):
178
  previous_address = df['Bank'].iloc[i-1]
179
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
180
  df.at[i,'bank_fuzzy_ratio'] = fuzzy_ratio
181
-
182
  df['bank_fuzzy_ratio'] = pd.to_numeric(df['bank_fuzzy_ratio'], errors='coerce')
183
 
184
- address_group_counter = 1
185
- df.at[0,'bank_based_group'] = str(address_group_counter)
186
-
187
  for i in range(1,len(df)):
188
  if df.at[i,'bank_fuzzy_ratio'] >= 100:
189
  df.at[i,'bank_based_group'] = df.at[i-1, 'bank_based_group']
190
  else:
191
  if df.at[i,'tax_based_group'] != group:
192
- address_group_counter = 1
193
  group = df.at[i,'tax_based_group']
194
  else:
195
- address_group_counter +=1
196
- df.at[i,'bank_based_group'] = str(address_group_counter)
197
  df['Group_tax_bank'] = df.apply(lambda row: '{}_{}'.format(row['tax_based_group'], row['bank_based_group']), axis = 1)
198
  group = df.at[0,'Group_tax_bank']
199
 
 
200
  df.sort_values(['Group_tax_bank','Address'], inplace=True)
201
  df = df.reset_index(drop=True)
202
  df.at[0,'address_fuzzy_ratio']=100
@@ -206,12 +196,11 @@ def process_csv(file, remove_null_columns):
206
  previous_address = df['Address'].iloc[i-1]
207
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
208
  df.at[i,'address_fuzzy_ratio'] = fuzzy_ratio
209
-
210
  df['address_fuzzy_ratio'] = pd.to_numeric(df['address_fuzzy_ratio'], errors='coerce')
211
 
 
212
  address_group_counter = 1
213
  df.at[0,'address_based_group'] = str(address_group_counter)
214
-
215
  for i in range(1,len(df)):
216
  if df.at[i,'address_fuzzy_ratio'] > 70:
217
  df.at[i,'address_based_group'] = df.at[i-1, 'address_based_group']
@@ -225,6 +214,7 @@ def process_csv(file, remove_null_columns):
225
  df['Group_tax_bank_add'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank'], row['address_based_group']), axis = 1)
226
  group = df.at[0,'Group_tax_bank_add']
227
 
 
228
  df.sort_values(['Group_tax_bank_add','Name'], inplace=True)
229
  df = df.reset_index(drop=True)
230
  df.at[0,'name_fuzzy_ratio']=100
@@ -234,25 +224,25 @@ def process_csv(file, remove_null_columns):
234
  previous_address = df['Name'].iloc[i-1]
235
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
236
  df.at[i,'name_fuzzy_ratio'] = fuzzy_ratio
237
-
238
  df['name_fuzzy_ratio'] = pd.to_numeric(df['name_fuzzy_ratio'], errors='coerce')
239
 
240
- address_group_counter = 1
241
- df.at[0,'name_based_group'] = str(address_group_counter)
242
-
243
  for i in range(1,len(df)):
244
  if df.at[i,'name_fuzzy_ratio'] > 80:
245
  df.at[i,'name_based_group'] = df.at[i-1, 'name_based_group']
246
  else:
247
  if df.at[i,'Group_tax_bank_add'] != group:
248
- address_group_counter = 1
249
  group = df.at[i,'Group_tax_bank_add']
250
  else:
251
- address_group_counter +=1
252
- df.at[i,'name_based_group'] = str(address_group_counter)
253
  df['Group_tax_bank_add_name'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank_add'], row['name_based_group']), axis = 1)
254
  group = df.at[0,'Group_tax_bank_add_name']
255
 
 
256
  df.sort_values(['Group_tax_bank_add_name','POSTCODE1'], inplace=True)
257
  df = df.reset_index(drop=True)
258
  df.at[0,'postal_fuzzy_ratio']=100
@@ -262,25 +252,25 @@ def process_csv(file, remove_null_columns):
262
  previous_address = df['POSTCODE1'].iloc[i-1]
263
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
264
  df.at[i,'postal_fuzzy_ratio'] = fuzzy_ratio
265
-
266
  df['postal_fuzzy_ratio'] = pd.to_numeric(df['postal_fuzzy_ratio'], errors='coerce')
267
 
268
- address_group_counter = 1
269
- df.at[0,'postal_based_group'] = str(address_group_counter)
270
-
271
  for i in range(1,len(df)):
272
  if df.at[i,'postal_fuzzy_ratio'] > 90:
273
  df.at[i,'postal_based_group'] = df.at[i-1, 'postal_based_group']
274
  else:
275
  if df.at[i,'Group_tax_bank_add_name'] != group:
276
- address_group_counter = 1
277
  group = df.at[i,'Group_tax_bank_add_name']
278
  else:
279
- address_group_counter +=1
280
- df.at[i,'postal_based_group'] = str(address_group_counter)
281
  df['Group_tax_bank_add_name_post'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank_add_name'], row['postal_based_group']), axis = 1)
282
  group = df.at[0,'Group_tax_bank_add_name_post']
283
 
 
284
  df.sort_values(['Group_tax_bank_add_name_post','KTOKK'], inplace=True)
285
  df = df.reset_index(drop=True)
286
  df.at[0,'accgrp_fuzzy_ratio']=100
@@ -290,46 +280,44 @@ def process_csv(file, remove_null_columns):
290
  previous_address = df['KTOKK'].iloc[i-1]
291
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
292
  df.at[i,'accgrp_fuzzy_ratio'] = fuzzy_ratio
293
-
294
  df['accgrp_fuzzy_ratio'] = pd.to_numeric(df['accgrp_fuzzy_ratio'], errors='coerce')
295
 
296
- address_group_counter = 1
297
- df.at[0,'accgrp_based_group'] = str(address_group_counter)
298
-
299
  for i in range(1,len(df)):
300
  if df.at[i,'accgrp_fuzzy_ratio'] >=100:
301
  df.at[i,'accgrp_based_group'] = df.at[i-1, 'accgrp_based_group']
302
  else:
303
  if df.at[i,'Group_tax_bank_add_name_post'] != group:
304
- address_group_counter = 1
305
  group = df.at[i,'Group_tax_bank_add_name_post']
306
  else:
307
- address_group_counter +=1
308
- df.at[i,'accgrp_based_group'] = str(address_group_counter)
309
  df['Group_tax_bank_add_name_post_accgrp'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank_add_name_post'], row['accgrp_based_group']), axis = 1)
310
  group = df.at[0,'Group_tax_bank_add_name_post_accgrp']
311
 
 
312
  duplicate_groups = df['Group_tax_bank_add_name_post_accgrp'].duplicated(keep=False)
313
  df['Remarks'] = ['Duplicate' if is_duplicate else 'Unique' for is_duplicate in duplicate_groups]
314
 
315
-
316
  df.replace(" ", np.nan, inplace=True)
317
  nan_percentage = df.isna().mean(axis=0)
318
-
319
- # Filter columns with more than 70% NaN values
320
  columns_to_drop = nan_percentage[nan_percentage > 0.7].index
321
  if remove_null_columns=='Yes':
322
  df.drop(columns=columns_to_drop, inplace=True)
323
  df.replace(np.nan, " ", inplace=True)
324
 
325
-
326
- # Call the function with your DataFrame
327
  gemini_analysis(df)
328
 
 
329
  columns_to_drop = ['name_fuzzy_ratio','accgrp_fuzzy_ratio','address_fuzzy_ratio','bank_fuzzy_ratio','tax_fuzzy_ratio','postal_fuzzy_ratio','name_based_group','accgrp_based_group','address_based_group','bank_based_group','tax_based_group','postal_based_group','Group_tax_bank','Group_tax_bank_add', 'Group_tax_bank_add_name', 'Group_tax_bank_add_name_post']
330
  df = df.drop(columns=columns_to_drop, axis=1)
331
 
332
-
333
  with tempfile.NamedTemporaryFile(prefix="Outputs", suffix=".xlsx", delete=False) as temp_file:
334
  df.to_excel(temp_file.name, index=False)
335
  excel_writer = pd.ExcelWriter(temp_file.name, engine='openpyxl')
@@ -339,32 +327,31 @@ def process_csv(file, remove_null_columns):
339
  workbook = excel_writer.book
340
  worksheet = workbook['Sheet1']
341
 
342
- # Apply row coloring based on the value in the 'Remarks' column
343
  duplicate_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
344
-
345
  for idx, row in df.iterrows():
346
  if row['Remarks'] == 'Duplicate':
347
  for cell in worksheet[idx + 2]:
348
  cell.alignment = Alignment(wrap_text=True)
349
  cell.fill = duplicate_fill
350
 
351
- # Iterate over columns and set their width based on a specific calculation
352
  for col in worksheet.columns:
353
  col_letter = col[0].column_letter
354
  worksheet.column_dimensions[col_letter].width = 28
355
 
356
- # Iterate over rows and set their height based on a specific calculation
357
  for row in worksheet.iter_rows():
358
- worksheet.row_dimensions[row[0].row].height = 20 # Set the row height to 25 (adjust as needed)
359
 
360
  # Save the changes
361
  excel_writer.close()
362
 
363
- print("Excel file saved successfully.")
364
-
365
  return temp_file.name
366
 
367
 
 
368
  interface = gr.Interface(
369
  fn=process_csv,
370
  inputs=[
@@ -380,4 +367,5 @@ interface = gr.Interface(
380
  description="Upload a XLSX file and choose which column to check for duplicates."
381
  )
382
 
 
383
  interface.launch(debug=True,share=True)
 
1
+ #import libraries
2
  import pathlib
3
  import textwrap
 
 
4
  import pandas as pd
5
  import numpy as np
6
+ import gradio as gr
7
+ import tempfile
8
  from fuzzywuzzy import fuzz
9
  from openpyxl import load_workbook
10
  from openpyxl.styles import PatternFill
11
+ from openpyxl.styles.alignment import Alignment
12
  import google.generativeai as genai
13
  from IPython.display import display
 
 
14
 
15
+
16
+ # Configure the Google Gemini API key (NOTE: hard-coding a secret in source is unsafe; load it from an environment variable instead)
17
  GOOGLE_API_KEY='AIzaSyCtACPu9EOnEa1_iAWsv_u__PQRpaCT564'
18
  genai.configure(api_key=GOOGLE_API_KEY)
19
+
20
+
21
+ #Load the gemini model
22
  model = genai.GenerativeModel('gemini-1.0-pro')
23
+
24
+
 
25
  # Function to apply to df1 to create the cont_person_name column
26
  def process_fuzzy_ratios(rows_dict):
27
  fuzz_data = {}
 
45
  "postal_fuzzy_ratio": row.pop("postal_fuzzy_ratio")
46
  }
47
  return fuzz_data, rows_dict
48
+
49
+ # Code to perform gemini analysis
50
  def gemini_analysis(dataframe):
51
  prev_row_duplicate = False
52
  prev_row_number = None
 
53
  for index, row in dataframe.iterrows():
54
+
55
+ # Find duplicate pairs
56
  if row['Remarks'] == 'Duplicate':
57
  if prev_row_duplicate:
58
  duplicate_pairs=[]
 
68
  main_data_str = "[{}]".format(', '.join([str(d) for d in duplicate_pairs]))
69
  fuzzy_data_str = "{}".format(fuzzy_ratios)
70
  qs="I have the data",main_data_str,"The corresponding fuzzy ratios are here: ",fuzzy_data_str,"Give a concise explanation why these two rows are duplicate based on analyzing the main data and explaining which column values are same and which column values are different?"
71
+
72
+ # Ask gemini to analyse the data
73
  try:
74
  response = model.generate_content(qs)
75
  dataframe.at[index-1, 'Explanation'] = response.text
 
79
  print(f"ValueError occurred: {ve}")
80
  except Exception as ex:
81
  print(f"An error occurred: {ex}")
82
+
83
+ # Write the explanation into the 'Explanation' column of the previous row of the duplicate pair
84
  dataframe.at[index-1, 'Explanation'] = response.text
85
  prev_row_duplicate = True
86
  prev_row_number = index
 
88
  prev_row_duplicate = False
89
  prev_row_number = None
90
 
91
+
92
+ # Code for de-duplication
93
  def process_csv(file, remove_null_columns):
94
  sheet_name1 = 'General Data '
95
  sheet_name2 = 'Contact Person'
96
+
97
+ # Read the 1st sheet of excel file
98
  df = pd.read_excel(file, sheet_name=sheet_name1,engine='openpyxl')
99
  # Replace null values with a blank space
100
  df=df.fillna(" ")
101
+ # Read the 2nd sheet of excel file
102
+ df1 = pd.read_excel(file, sheet_name=sheet_name2,engine='openpyxl')
103
  # Replace null values with a blank space
104
  df1 = df1.fillna(" ")
105
+
106
  # Creating new columns by concatenating original columns
107
  df['Address'] = df['STREET'].astype(str) +'-'+ df['CITY1'].astype(str) +'-'+ df['COUNTRY'].astype(str) + '-' + df['REGION'].astype(str)
108
  df['Name'] = df['NAMEFIRST'].astype(str)+'-'+ df['NAMELAST'].astype(str) +'-'+ df['NAME3'].astype(str) + '-' + df['NAME4'].astype(str)
109
  df['Bank'] = df['BANKL'].astype(str)+'-'+df['BANKN'].astype(str)
110
  df['Tax'] = df['TAXTYPE'].astype(str)+'-'+df['TAXNUM'].astype(str)
 
 
111
 
112
  # Converting all concatenated columns to lowercase
113
  df['Name']=df['Name'].str.lower()
114
  df['Address']=df['Address'].str.lower()
115
  df['Bank']=df['Bank'].str.lower()
116
  df['Tax']=df['Tax'].str.lower()
117
+
118
+ # Create new columns with the following names for fuzzy ratio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  df['name_fuzzy_ratio']=''
120
  df['accgrp_fuzzy_ratio']=''
121
  df['address_fuzzy_ratio']=''
122
  df['bank_fuzzy_ratio']=''
123
  df['tax_fuzzy_ratio']=''
124
  df['postal_fuzzy_ratio']=''
 
 
125
 
126
+ # Create new columns with the following names for creating groups
127
  df['name_based_group']=''
128
  df['accgrp_based_group']=''
129
  df['address_based_group']=''
130
  df['bank_based_group']=''
131
  df['tax_based_group']=''
132
  df['postal_based_group']=''
 
 
133
 
134
+ # Calculate last row index value
135
  last_row_index = len(df)-1
136
  last_row_index1 = len(df1)-1
137
 
138
+ # Calculate the fuzzy ratios for tax column
139
  df.sort_values(['Tax'], inplace=True)
140
  df = df.reset_index(drop=True)
141
  df.at[0,'tax_fuzzy_ratio']=100
 
145
  previous_tax = df['Tax'].iloc[i-1]
146
  fuzzy_ratio = fuzz.ratio(previous_tax,current_tax)
147
  df.at[i,'tax_fuzzy_ratio'] = fuzzy_ratio
 
148
  df['tax_fuzzy_ratio'] = pd.to_numeric(df['tax_fuzzy_ratio'], errors='coerce')
149
 
150
+ # Calculate the duplicate groups based on tax column
151
  group_counter = 1
152
  df.at[0,'tax_based_group'] = group_counter
 
153
  for i in range (1, len(df)):
154
  if df.at[i,'tax_fuzzy_ratio'] > 90:
155
  df.at[i,'tax_based_group'] = df.at[i-1,'tax_based_group']
 
158
  df.at[i,'tax_based_group'] = group_counter
159
  group = df.at[0,'tax_based_group']
160
 
161
+ # Calculate the fuzzy ratios for bank column
162
  df.sort_values(['tax_based_group','Bank'], inplace=True)
163
  df = df.reset_index(drop=True)
164
  df.at[0,'bank_fuzzy_ratio']=100
 
168
  previous_address = df['Bank'].iloc[i-1]
169
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
170
  df.at[i,'bank_fuzzy_ratio'] = fuzzy_ratio
 
171
  df['bank_fuzzy_ratio'] = pd.to_numeric(df['bank_fuzzy_ratio'], errors='coerce')
172
 
173
+ # Calculate the duplicate groups for bank column
174
+ bank_group_counter = 1
175
+ df.at[0,'bank_based_group'] = str(bank_group_counter)
176
  for i in range(1,len(df)):
177
  if df.at[i,'bank_fuzzy_ratio'] >= 100:
178
  df.at[i,'bank_based_group'] = df.at[i-1, 'bank_based_group']
179
  else:
180
  if df.at[i,'tax_based_group'] != group:
181
+ bank_group_counter = 1
182
  group = df.at[i,'tax_based_group']
183
  else:
184
+ bank_group_counter +=1
185
+ df.at[i,'bank_based_group'] = str(bank_group_counter)
186
  df['Group_tax_bank'] = df.apply(lambda row: '{}_{}'.format(row['tax_based_group'], row['bank_based_group']), axis = 1)
187
  group = df.at[0,'Group_tax_bank']
188
 
189
+ # Calculate the fuzzy ratios for address column
190
  df.sort_values(['Group_tax_bank','Address'], inplace=True)
191
  df = df.reset_index(drop=True)
192
  df.at[0,'address_fuzzy_ratio']=100
 
196
  previous_address = df['Address'].iloc[i-1]
197
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
198
  df.at[i,'address_fuzzy_ratio'] = fuzzy_ratio
 
199
  df['address_fuzzy_ratio'] = pd.to_numeric(df['address_fuzzy_ratio'], errors='coerce')
200
 
201
+ # Calculate the duplicate groups for address column
202
  address_group_counter = 1
203
  df.at[0,'address_based_group'] = str(address_group_counter)
 
204
  for i in range(1,len(df)):
205
  if df.at[i,'address_fuzzy_ratio'] > 70:
206
  df.at[i,'address_based_group'] = df.at[i-1, 'address_based_group']
 
214
  df['Group_tax_bank_add'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank'], row['address_based_group']), axis = 1)
215
  group = df.at[0,'Group_tax_bank_add']
216
 
217
+ # Calculate the fuzzy ratios for name column
218
  df.sort_values(['Group_tax_bank_add','Name'], inplace=True)
219
  df = df.reset_index(drop=True)
220
  df.at[0,'name_fuzzy_ratio']=100
 
224
  previous_address = df['Name'].iloc[i-1]
225
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
226
  df.at[i,'name_fuzzy_ratio'] = fuzzy_ratio
 
227
  df['name_fuzzy_ratio'] = pd.to_numeric(df['name_fuzzy_ratio'], errors='coerce')
228
 
229
+ # Calculate the duplicate groups for name column
230
+ name_group_counter = 1
231
+ df.at[0,'name_based_group'] = str(name_group_counter)
232
  for i in range(1,len(df)):
233
  if df.at[i,'name_fuzzy_ratio'] > 80:
234
  df.at[i,'name_based_group'] = df.at[i-1, 'name_based_group']
235
  else:
236
  if df.at[i,'Group_tax_bank_add'] != group:
237
+ name_group_counter = 1
238
  group = df.at[i,'Group_tax_bank_add']
239
  else:
240
+ name_group_counter +=1
241
+ df.at[i,'name_based_group'] = str(name_group_counter)
242
  df['Group_tax_bank_add_name'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank_add'], row['name_based_group']), axis = 1)
243
  group = df.at[0,'Group_tax_bank_add_name']
244
 
245
+ # Calculate the fuzzy ratios for postcode column
246
  df.sort_values(['Group_tax_bank_add_name','POSTCODE1'], inplace=True)
247
  df = df.reset_index(drop=True)
248
  df.at[0,'postal_fuzzy_ratio']=100
 
252
  previous_address = df['POSTCODE1'].iloc[i-1]
253
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
254
  df.at[i,'postal_fuzzy_ratio'] = fuzzy_ratio
 
255
  df['postal_fuzzy_ratio'] = pd.to_numeric(df['postal_fuzzy_ratio'], errors='coerce')
256
 
257
+ # Calculate the duplicate groups for postcode column
258
+ postcode_group_counter = 1
259
+ df.at[0,'postal_based_group'] = str(postcode_group_counter)
260
  for i in range(1,len(df)):
261
  if df.at[i,'postal_fuzzy_ratio'] > 90:
262
  df.at[i,'postal_based_group'] = df.at[i-1, 'postal_based_group']
263
  else:
264
  if df.at[i,'Group_tax_bank_add_name'] != group:
265
+ postcode_group_counter = 1
266
  group = df.at[i,'Group_tax_bank_add_name']
267
  else:
268
+ postcode_group_counter +=1
269
+ df.at[i,'postal_based_group'] = str(postcode_group_counter)
270
  df['Group_tax_bank_add_name_post'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank_add_name'], row['postal_based_group']), axis = 1)
271
  group = df.at[0,'Group_tax_bank_add_name_post']
272
 
273
+ # Calculate the fuzzy ratios for accgrp column
274
  df.sort_values(['Group_tax_bank_add_name_post','KTOKK'], inplace=True)
275
  df = df.reset_index(drop=True)
276
  df.at[0,'accgrp_fuzzy_ratio']=100
 
280
  previous_address = df['KTOKK'].iloc[i-1]
281
  fuzzy_ratio = fuzz.ratio(previous_address, current_address)
282
  df.at[i,'accgrp_fuzzy_ratio'] = fuzzy_ratio
 
283
  df['accgrp_fuzzy_ratio'] = pd.to_numeric(df['accgrp_fuzzy_ratio'], errors='coerce')
284
 
285
+ # Calculate the duplicate groups for accgrp column
286
+ accgrp_group_counter = 1
287
+ df.at[0,'accgrp_based_group'] = str(accgrp_group_counter)
288
  for i in range(1,len(df)):
289
  if df.at[i,'accgrp_fuzzy_ratio'] >=100:
290
  df.at[i,'accgrp_based_group'] = df.at[i-1, 'accgrp_based_group']
291
  else:
292
  if df.at[i,'Group_tax_bank_add_name_post'] != group:
293
+ accgrp_group_counter = 1
294
  group = df.at[i,'Group_tax_bank_add_name_post']
295
  else:
296
+ accgrp_group_counter +=1
297
+ df.at[i,'accgrp_based_group'] = str(accgrp_group_counter)
298
  df['Group_tax_bank_add_name_post_accgrp'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank_add_name_post'], row['accgrp_based_group']), axis = 1)
299
  group = df.at[0,'Group_tax_bank_add_name_post_accgrp']
300
 
301
+ # Find the final duplicate groups by combining all criteria (AND condition)
302
  duplicate_groups = df['Group_tax_bank_add_name_post_accgrp'].duplicated(keep=False)
303
  df['Remarks'] = ['Duplicate' if is_duplicate else 'Unique' for is_duplicate in duplicate_groups]
304
 
305
+ # Identify columns with more than 70% NaN values and drop them if the user requested it
306
  df.replace(" ", np.nan, inplace=True)
307
  nan_percentage = df.isna().mean(axis=0)
 
 
308
  columns_to_drop = nan_percentage[nan_percentage > 0.7].index
309
  if remove_null_columns=='Yes':
310
  df.drop(columns=columns_to_drop, inplace=True)
311
  df.replace(np.nan, " ", inplace=True)
312
 
313
+ # Ask Gemini to analyse the duplicate rows
 
314
  gemini_analysis(df)
315
 
316
+ # Drop the columns related to fuzzy ratios and groups
317
  columns_to_drop = ['name_fuzzy_ratio','accgrp_fuzzy_ratio','address_fuzzy_ratio','bank_fuzzy_ratio','tax_fuzzy_ratio','postal_fuzzy_ratio','name_based_group','accgrp_based_group','address_based_group','bank_based_group','tax_based_group','postal_based_group','Group_tax_bank','Group_tax_bank_add', 'Group_tax_bank_add_name', 'Group_tax_bank_add_name_post']
318
  df = df.drop(columns=columns_to_drop, axis=1)
319
 
320
+ # Create a temporary file
321
  with tempfile.NamedTemporaryFile(prefix="Outputs", suffix=".xlsx", delete=False) as temp_file:
322
  df.to_excel(temp_file.name, index=False)
323
  excel_writer = pd.ExcelWriter(temp_file.name, engine='openpyxl')
 
327
  workbook = excel_writer.book
328
  worksheet = workbook['Sheet1']
329
 
330
+ # Apply row coloring based on the value in the 'Remarks' column and also wrap the texts
331
  duplicate_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
 
332
  for idx, row in df.iterrows():
333
  if row['Remarks'] == 'Duplicate':
334
  for cell in worksheet[idx + 2]:
335
  cell.alignment = Alignment(wrap_text=True)
336
  cell.fill = duplicate_fill
337
 
338
+ # Iterate over columns and set their width
339
  for col in worksheet.columns:
340
  col_letter = col[0].column_letter
341
  worksheet.column_dimensions[col_letter].width = 28
342
 
343
+ # Iterate over rows and set their height
344
  for row in worksheet.iter_rows():
345
+ worksheet.row_dimensions[row[0].row].height = 20
346
 
347
  # Save the changes
348
  excel_writer.close()
349
 
350
+ # Return the temporary file
 
351
  return temp_file.name
352
 
353
 
354
+ # Set up the Gradio interface
355
  interface = gr.Interface(
356
  fn=process_csv,
357
  inputs=[
 
367
  description="Upload a XLSX file and choose which column to check for duplicates."
368
  )
369
 
370
+ # Launch the interface
371
  interface.launch(debug=True,share=True)