Spaces:
Build error
Build error
Commit
·
4efd35b
1
Parent(s):
80b184f
ignore other accounts
Browse files- __pycache__/helper.cpython-312.pyc +0 -0
- app.py +2 -0
- helper.py +54 -37
__pycache__/helper.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/helper.cpython-312.pyc and b/__pycache__/helper.cpython-312.pyc differ
|
|
|
app.py
CHANGED
|
@@ -44,6 +44,7 @@ def process_files(excel_file, text_file):
|
|
| 44 |
|
| 45 |
# Ensure the 'Employer Number' column values are formatted as zero-padded 6-digit strings
|
| 46 |
df_excel['Employer Number'] = [str(number).zfill(6) for number in df_excel['Employer Number']]
|
|
|
|
| 47 |
|
| 48 |
# Read and process the text file content into a list of lines
|
| 49 |
lines = text_file.read().decode('utf-8').splitlines()
|
|
@@ -51,6 +52,7 @@ def process_files(excel_file, text_file):
|
|
| 51 |
|
| 52 |
# Create a DataFrame from the parsed text file data
|
| 53 |
df = pd.DataFrame(data)
|
|
|
|
| 54 |
|
| 55 |
return df_excel, df
|
| 56 |
|
|
|
|
| 44 |
|
| 45 |
# Ensure the 'Employer Number' column values are formatted as zero-padded 6-digit strings
|
| 46 |
df_excel['Employer Number'] = [str(number).zfill(6) for number in df_excel['Employer Number']]
|
| 47 |
+
df_excel = df_excel.dropna(subset=['Employer Name'])
|
| 48 |
|
| 49 |
# Read and process the text file content into a list of lines
|
| 50 |
lines = text_file.read().decode('utf-8').splitlines()
|
|
|
|
| 52 |
|
| 53 |
# Create a DataFrame from the parsed text file data
|
| 54 |
df = pd.DataFrame(data)
|
| 55 |
+
df = df[df[1].isin(['1001010071', '1001233102'])]
|
| 56 |
|
| 57 |
return df_excel, df
|
| 58 |
|
helper.py
CHANGED
|
@@ -107,7 +107,7 @@ def generate_df(master_data, df, employer_names):
|
|
| 107 |
"""
|
| 108 |
dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
|
| 109 |
bank_desc = list(df[9])
|
| 110 |
-
accounts = ['NASA' if i == '
|
| 111 |
credits = list(df[7])
|
| 112 |
|
| 113 |
# Initialize lists for employer-related fields
|
|
@@ -128,6 +128,17 @@ def generate_df(master_data, df, employer_names):
|
|
| 128 |
date_joined.append(np.nan)
|
| 129 |
termination_date.append(np.nan)
|
| 130 |
email_addr.append(np.nan)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
else:
|
| 132 |
tmp = master_data[master_data['Employer Name'] == name]
|
| 133 |
if tmp.empty:
|
|
@@ -203,6 +214,7 @@ def get_res_df(master_data, df, thrshld):
|
|
| 203 |
# Preprocess queries from transaction data
|
| 204 |
queries = list(df[9])
|
| 205 |
queries = [query[:query.rindex('-')] for query in queries] # Extract part of the query before '-'
|
|
|
|
| 206 |
empnos = [fetch_empno(text) for text in queries]
|
| 207 |
new_queries = [preprocess_query(query) for query in queries]
|
| 208 |
|
|
@@ -218,46 +230,51 @@ def get_res_df(master_data, df, thrshld):
|
|
| 218 |
exact_matches.append('')
|
| 219 |
|
| 220 |
res_names, found_by, scores = [], [], []
|
| 221 |
-
found_by_direct_search, found_by_emp_no, found_by_bm5, not_found = 0, 0, 0, 0
|
| 222 |
|
| 223 |
# Match each query to an employer
|
| 224 |
-
for query,empno_arr,exact_match in zip(new_queries,empnos,exact_matches):
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
|
| 260 |
# Generate the final result DataFrame
|
| 261 |
res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
|
| 262 |
-
|
| 263 |
return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found
|
|
|
|
| 107 |
"""
|
| 108 |
dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
|
| 109 |
bank_desc = list(df[9])
|
| 110 |
+
accounts = ['NASA' if i == '1001010071' else 'EDAS' if i == '1001233102' else '' for i in df[1]]
|
| 111 |
credits = list(df[7])
|
| 112 |
|
| 113 |
# Initialize lists for employer-related fields
|
|
|
|
| 128 |
date_joined.append(np.nan)
|
| 129 |
termination_date.append(np.nan)
|
| 130 |
email_addr.append(np.nan)
|
| 131 |
+
elif name == "EDAS":
|
| 132 |
+
employer_codes.append(np.nan)
|
| 133 |
+
bank_statemnt_ref.append(np.nan)
|
| 134 |
+
account_mgr.append(np.nan)
|
| 135 |
+
emp_province.append(np.nan)
|
| 136 |
+
region.append(np.nan)
|
| 137 |
+
industry.append(np.nan)
|
| 138 |
+
contributing_stts.append(np.nan)
|
| 139 |
+
date_joined.append(np.nan)
|
| 140 |
+
termination_date.append(np.nan)
|
| 141 |
+
email_addr.append(np.nan)
|
| 142 |
else:
|
| 143 |
tmp = master_data[master_data['Employer Name'] == name]
|
| 144 |
if tmp.empty:
|
|
|
|
| 214 |
# Preprocess queries from transaction data
|
| 215 |
queries = list(df[9])
|
| 216 |
queries = [query[:query.rindex('-')] for query in queries] # Extract part of the query before '-'
|
| 217 |
+
acc_nos = list(df[1])
|
| 218 |
empnos = [fetch_empno(text) for text in queries]
|
| 219 |
new_queries = [preprocess_query(query) for query in queries]
|
| 220 |
|
|
|
|
| 230 |
exact_matches.append('')
|
| 231 |
|
| 232 |
res_names, found_by, scores = [], [], []
|
| 233 |
+
found_by_direct_search, found_by_emp_no, found_by_bm5, not_found, edas = 0, 0, 0, 0, 0
|
| 234 |
|
| 235 |
# Match each query to an employer
|
| 236 |
+
for query,empno_arr,exact_match,acc_no in zip(new_queries,empnos,exact_matches,acc_nos):
|
| 237 |
+
if acc_no == '1001233102':
|
| 238 |
+
edas+=1
|
| 239 |
+
res_names.append("EDAS")
|
| 240 |
+
found_by.append("EDAS")
|
| 241 |
+
else:
|
| 242 |
+
name = ""
|
| 243 |
+
# Find Employer by Direct Search
|
| 244 |
+
if exact_match!='':
|
| 245 |
+
name = exact_match
|
| 246 |
+
scores.append(100)
|
| 247 |
+
found_by_direct_search+=1
|
| 248 |
+
found_by.append("Direct Search")
|
| 249 |
+
res_names.append(name)
|
| 250 |
|
| 251 |
+
# Try to find an employer using the employer number if direct search fails
|
| 252 |
+
elif len(empno_arr) != 0:
|
| 253 |
+
for empno in empno_arr:
|
| 254 |
+
names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
|
| 255 |
+
if len(names)!=0:
|
| 256 |
+
name=names[0]
|
| 257 |
+
scores.append(100) # Perfect match with employee number
|
| 258 |
+
found_by_emp_no+=1
|
| 259 |
+
found_by.append("Employer Number")
|
| 260 |
+
res_names.append(name)
|
| 261 |
+
break
|
| 262 |
+
# Fall back to BM25 matching if the employer number lookup fails
|
| 263 |
+
if name=="":
|
| 264 |
+
tokenized_query = query.split(" ")
|
| 265 |
+
name = bm25.get_top_n(tokenized_query, corpus, n=1)
|
| 266 |
+
doc_score = max(bm25.get_scores(tokenized_query))
|
| 267 |
+
scores.append(doc_score)
|
| 268 |
+
if doc_score>threshold:
|
| 269 |
+
found_by_bm5 += 1
|
| 270 |
+
res_names.append(name[0])
|
| 271 |
+
found_by.append("BM25")
|
| 272 |
+
else:
|
| 273 |
+
not_found+=1
|
| 274 |
+
res_names.append("NOT FOUND")
|
| 275 |
+
found_by.append("NOT FOUND")
|
| 276 |
|
| 277 |
# Generate the final result DataFrame
|
| 278 |
res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
|
| 279 |
+
print(f"{found_by_direct_search=},{found_by_emp_no=},{found_by_bm5=},{not_found=},{edas=}")
|
| 280 |
return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found
|