Spaces:
Build error
Build error
Update pages/5SOURCE TO TARGET MAPPING.py
Browse files- pages/5SOURCE TO TARGET MAPPING.py +597 -595
pages/5SOURCE TO TARGET MAPPING.py
CHANGED
|
@@ -54,632 +54,634 @@ st.markdown("""
|
|
| 54 |
}
|
| 55 |
</style>
|
| 56 |
""", unsafe_allow_html=True)
|
| 57 |
-
|
| 58 |
-
def
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
bracket_count += 1
|
| 74 |
-
elif char == ')':
|
| 75 |
-
bracket_count -+ 1
|
| 76 |
-
if char == ',' and bracket_count == 0:
|
| 77 |
-
conditions.append(condition.strip())
|
| 78 |
-
condition = ''
|
| 79 |
-
else:
|
| 80 |
-
condition += char
|
| 81 |
-
if condition:
|
| 82 |
conditions.append(condition.strip())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
-
join = []
|
| 88 |
-
join_pattern = re.compile(r'(\w+\.\w+)\s*=\s*(\w+\w.\w+)', re.IGNORECASE)
|
| 89 |
-
for join_condition in join_conditions:
|
| 90 |
-
parts = re.split(r'\sAND\s|\sOR\s', join_condition, flags = re.IGNORECASE)
|
| 91 |
-
temp = [x.strip() for x in parts if join_pattern.match(x.strip())]
|
| 92 |
-
join.append(' AND '.join(temp))
|
| 93 |
-
return join
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
source_column = row['Source Column']
|
| 133 |
-
joining_keys = row['Joining Keys']
|
| 134 |
-
primary_key = row['Primary Key']
|
| 135 |
-
direct_derived = row['Direct/Derived']
|
| 136 |
-
join_type = row['Join Type']
|
| 137 |
-
join_tables = row['Join Tables']
|
| 138 |
-
join_condition = row['Join Condition']
|
| 139 |
-
|
| 140 |
-
if source_table == base_table:
|
| 141 |
-
if primary_key == 'Y':
|
| 142 |
-
key = (source_table, joining_keys, join_type, join_tables, join_condition)
|
| 143 |
-
key1 = source_table
|
| 144 |
-
else:
|
| 145 |
-
continue
|
| 146 |
-
else:
|
| 147 |
key = (source_table, joining_keys, join_type, join_tables, join_condition)
|
| 148 |
key1 = source_table
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
'join_type': join_type,
|
| 155 |
-
'join_tables': join_tables,
|
| 156 |
'join_condition': join_condition
|
| 157 |
}
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
incr_query = []
|
| 182 |
-
incr_cols = ''
|
| 183 |
-
incr_tables = []
|
| 184 |
-
incr_join = {}
|
| 185 |
|
| 186 |
-
for
|
| 187 |
-
|
| 188 |
-
for table in _.split():
|
| 189 |
-
if base_table != table:
|
| 190 |
-
|
| 191 |
-
join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')]
|
| 192 |
-
join_keys = [t.strip() for t in ','.join(base_pk).split(',')]
|
| 193 |
-
join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')]
|
| 194 |
-
join_cond = split_join_condition(incr_table_join_info[table])
|
| 195 |
-
join_condition = join_incr(join_cond)
|
| 196 |
-
source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
|
| 197 |
-
|
| 198 |
-
join_key_list = []
|
| 199 |
-
for x in join_keys:
|
| 200 |
-
join_key_list.append(f'{base_table}.{x}')
|
| 201 |
-
join_key = ', '.join(join_key_list)
|
| 202 |
-
|
| 203 |
-
for y in source_table:
|
| 204 |
-
sql = f"""
|
| 205 |
-
INSERT INTO {temp_table_schema}.{temp_table}_INCR
|
| 206 |
-
(
|
| 207 |
-
SELECT {join_key}, {table_details_mapping[y][0]}, {table_details_mapping[y][1]}, '{y}', 1, CURRENT_TIMESTAMP
|
| 208 |
-
FROM {source_table_schema}.{base_table} {base_table}"""
|
| 209 |
-
|
| 210 |
-
incr_join_text = ''
|
| 211 |
-
for i in range(len(join_condition)):
|
| 212 |
-
sql += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 213 |
-
incr_join_text += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 214 |
-
incr_join[y] = incr_join_text
|
| 215 |
-
|
| 216 |
-
sql += f"""
|
| 217 |
-
WHERE COALESCE({join_tables[i+1]}.operation,'NA') <> 'D'
|
| 218 |
-
AND TO_TIMESTAMP( CAST(SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),1,4) || '-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),5,2) ||'-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),7,2) || ' ' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),9,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),11,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS') > (SELECT MAX(max_update_date) FROM audit.reportingdb_audit_tbl_{temp_table} WHERE mart_table_name='{temp_table}' and src_table_name='{y}')
|
| 219 |
-
);"""
|
| 220 |
-
|
| 221 |
-
incr_query.append(sql)
|
| 222 |
-
incr_tables.append(y)
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
| 234 |
sql = f"""
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
(
|
| 251 |
-
SELECT DISTINCT {incr_cols.replace(f'{base_table}.', '')}
|
| 252 |
-
FROM {temp_table_schema}.{temp_table}_INCR
|
| 253 |
-
);"""
|
| 254 |
-
|
| 255 |
-
proc_query.append(sql)
|
| 256 |
-
|
| 257 |
-
incr_table_dict = {}
|
| 258 |
-
for table in incr_tables:
|
| 259 |
-
if table == base_table:
|
| 260 |
-
incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}'
|
| 261 |
-
else:
|
| 262 |
-
p = [x for x in incr_join[table].split('\n\t') if len(x) > 1]
|
| 263 |
-
if len(p) == 1:
|
| 264 |
-
incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}'
|
| 265 |
-
else:
|
| 266 |
-
incr_table_dict[table] = f'{source_table_schema}.{table}'
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
-
|
| 273 |
-
for key in incr_cols.replace(f'{base_table}.', '').split(','):
|
| 274 |
-
incr2_sql_list.append(f"{base_table}.{key} = A.{key}")
|
| 275 |
-
incr2_sql_join = ' AND '.join(incr2_sql_list)
|
| 276 |
-
|
| 277 |
sql = f"""
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
{temp_table_schema}.INCR1_{temp_table} A ON {incr2_sql_join}; """
|
| 286 |
-
proc_query.append(f'DROP TABLE IF EXISTS {temp_table_schema}.INCR2_{table};')
|
| 287 |
proc_query.append(sql)
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
else:
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
s.append(sql)
|
| 302 |
-
|
| 303 |
-
for x in s:
|
| 304 |
-
proc_query.append(x)
|
| 305 |
-
|
| 306 |
-
select_clause = []
|
| 307 |
-
from_clause = []
|
| 308 |
-
where_clause = []
|
| 309 |
-
|
| 310 |
-
for _,row in df.iterrows():
|
| 311 |
-
field_name = row['Field_Name']
|
| 312 |
-
source_table = row['Source Table']
|
| 313 |
-
source_column = row['Source Column']
|
| 314 |
-
joining_keys = row['Joining Keys']
|
| 315 |
-
primary_key = row['Primary Key']
|
| 316 |
-
direct_derived = row['Direct/Derived']
|
| 317 |
-
join_type = row['Join Type']
|
| 318 |
-
join_tables = row['Join Tables']
|
| 319 |
-
join_condition = row['Join Condition']
|
| 320 |
-
column_operation = row['Column Operations']
|
| 321 |
-
alias = row['Alias']
|
| 322 |
-
granularity = row['Granularity']
|
| 323 |
-
filter_condition = row['Filter Condition']
|
| 324 |
-
clauses = row['Clauses']
|
| 325 |
-
ordering = row['Ordering']
|
| 326 |
-
|
| 327 |
-
if pd.notna(direct_derived):
|
| 328 |
-
if pd.notna(column_operation):
|
| 329 |
-
if len(column_operation.split()) == 1:
|
| 330 |
-
select_expr = f'{column_operation.upper()}({source_table}.{source_column})'
|
| 331 |
-
else:
|
| 332 |
-
select_expr = column_operation
|
| 333 |
-
else:
|
| 334 |
-
if pd.notna(source_table):
|
| 335 |
-
select_expr = f'{source_table}.{source_column}'
|
| 336 |
-
else:
|
| 337 |
-
select_expr = source_column
|
| 338 |
-
|
| 339 |
-
if source_column not in join_fields:
|
| 340 |
-
if pd.notna(alias):
|
| 341 |
-
select_expr += f' AS {alias}'
|
| 342 |
-
else:
|
| 343 |
-
if pd.notna(column_operation) and pd.notna(source_column):
|
| 344 |
-
select_expr += f' AS {source_column}'
|
| 345 |
-
|
| 346 |
-
if direct_derived.upper() == 'DIRECT':
|
| 347 |
-
select_clause.append(select_expr)
|
| 348 |
-
elif direct_derived.upper() == 'DERIVED_BASE':
|
| 349 |
-
select_clause.append(select_expr)
|
| 350 |
-
|
| 351 |
-
if pd.notna(filter_condition):
|
| 352 |
-
where_clause.append(filter_condition)
|
| 353 |
-
|
| 354 |
-
select_query = ',\n\t'.join(select_clause)
|
| 355 |
-
sql_query = f"CREATE TABLE {temp_table_schema}.{base_table}_BASE\nAS \n\tSELECT \n\t{select_query} \nFROM\n\t{incr_table_dict[base_table]} {base_table}"
|
| 356 |
-
if where_clause:
|
| 357 |
-
sql_query += f"\nWHERE {' AND'.join(where_clause)}"
|
| 358 |
-
sql_query += ';'
|
| 359 |
-
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{base_table}_BASE;")
|
| 360 |
-
proc_query.append(sql_query)
|
| 361 |
-
|
| 362 |
-
df['Clauses'].fillna('', inplace = True)
|
| 363 |
-
df['Ordering'].fillna('', inplace = True)
|
| 364 |
-
c = 1
|
| 365 |
-
temp_base_table = f'{base_table}_BASE'
|
| 366 |
-
grp_cols = ['Join Condition', 'Clauses', 'Ordering']
|
| 367 |
-
join_grps = df[df['Direct/Derived'] == 'DERIVED'].groupby(['Join Condition', 'Clauses', 'Ordering'])
|
| 368 |
-
temp_tables_sql = []
|
| 369 |
-
for (join_condition,clauses,ordering), group in join_grps:
|
| 370 |
-
if pd.notna(group['Direct/Derived'].iloc[0]):
|
| 371 |
-
if group['Direct/Derived'].iloc[0].upper() == 'DERIVED':
|
| 372 |
-
join_tables = [t.strip() for t in group['Join Tables'].iloc[0].split(',')]
|
| 373 |
-
join_keys = [t.strip() for t in group['Joining Keys'].iloc[0].split(',')]
|
| 374 |
-
join_type = [t.strip() for t in group['Join Type'].iloc[0].split(',')]
|
| 375 |
-
join_condition = split_join_condition(group['Join Condition'].iloc[0])
|
| 376 |
-
temp_table_name = f"TEMP_{group['Source Table'].iloc[0]}"
|
| 377 |
-
source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')]
|
| 378 |
-
alias = [t.strip() for t in (','.join(group['Alias'])).split(',')]
|
| 379 |
-
source_table = [t.strip() for t in (','.join(group['Source Table'])).split(',')]
|
| 380 |
-
|
| 381 |
-
base_cols = []
|
| 382 |
-
for join_key in join_keys:
|
| 383 |
-
base_cols.append(f'{join_tables[0]}.{join_key}')
|
| 384 |
-
|
| 385 |
-
for s_table,col,alias in zip(source_table,source_column,alias):
|
| 386 |
-
if pd.notna(group['Column Operations'].iloc[0]):
|
| 387 |
-
if len(group['Column Operations'].iloc[0].split()) == 1:
|
| 388 |
-
select_expr = f"{group['Column Operations'].iloc[0].upper()}({s_table}.{col})"
|
| 389 |
-
else:
|
| 390 |
-
select_expr = group['Column Operations'].iloc[0]
|
| 391 |
-
else:
|
| 392 |
-
if pd.notna(s_table):
|
| 393 |
-
select_expr = f"{s_table}.{col}"
|
| 394 |
-
else:
|
| 395 |
-
select_expr = col
|
| 396 |
-
|
| 397 |
-
if alias:
|
| 398 |
-
select_expr += f" AS {alias}"
|
| 399 |
-
base_cols.append(select_expr)
|
| 400 |
-
|
| 401 |
-
if ordering:
|
| 402 |
-
base_cols.append(f"{ordering} AS RN")
|
| 403 |
-
|
| 404 |
-
sql = ',\n\t\t'.join(base_cols)
|
| 405 |
-
|
| 406 |
-
join_sql = f"SELECT \n\t\t{sql} \nFROM\n\t{incr_table_dict[base_table]} {join_tables[0]}"
|
| 407 |
-
for i in range(len(join_type)):
|
| 408 |
-
join_sql += f'\n\t{join_type[i]} JOIN {incr_table_dict[join_tables[i+1]]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 409 |
-
if clauses:
|
| 410 |
-
join_sql += f'\n\t{clauses}'
|
| 411 |
-
join_sql += ';'
|
| 412 |
-
|
| 413 |
-
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{temp_table_name};")
|
| 414 |
-
proc_query.append(f"CREATE TABLE {temp_table_schema}.{temp_table_name}\nAS \n\t{join_sql}")
|
| 415 |
-
|
| 416 |
-
granularity = [t.strip() for t in group['Granularity'].iloc[0].split(',')]
|
| 417 |
-
|
| 418 |
-
sql = []
|
| 419 |
-
for key in join_keys:
|
| 420 |
-
sql.append(f"A.{key} = B.{key}")
|
| 421 |
-
|
| 422 |
-
temp_cols = []
|
| 423 |
-
temp_cols.append('A.*')
|
| 424 |
-
|
| 425 |
-
source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')]
|
| 426 |
-
alias = [t.strip() for t in (','.join(group['Alias'])).split(',')]
|
| 427 |
-
|
| 428 |
-
for col,alias in zip(source_column,alias):
|
| 429 |
-
select_expr = f"B.{col}"
|
| 430 |
-
if alias:
|
| 431 |
-
select_expr = f"B.{alias}"
|
| 432 |
-
else:
|
| 433 |
-
select_expr = f"B.{col}"
|
| 434 |
-
temp_cols.append(select_expr)
|
| 435 |
-
|
| 436 |
-
temp_select_query = ',\n\t\t'.join(temp_cols)
|
| 437 |
-
|
| 438 |
-
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.TEMP_{temp_table}_{c};")
|
| 439 |
-
|
| 440 |
-
base_sql = f"CREATE TABLE {temp_table_schema}.TEMP_{temp_table}_{c}\nAS \n\tSELECT \n\t\t{temp_select_query} \nFROM\n\t{temp_table_schema}.{temp_base_table} AS A"
|
| 441 |
-
base_sql += f"\n\tLEFT OUTER JOIN {temp_table_schema}.{temp_table_name} B ON {' AND '.join(sql)}"
|
| 442 |
-
|
| 443 |
-
if '1:1' in granularity and len(ordering) > 1:
|
| 444 |
-
base_sql += f" AND B.RN = 1"
|
| 445 |
-
base_sql += ';'
|
| 446 |
-
|
| 447 |
-
temp_base_table = f'TEMP_{temp_table}_{c}'
|
| 448 |
-
c += 1
|
| 449 |
-
proc_query.append(base_sql)
|
| 450 |
-
|
| 451 |
-
fin_table_name = temp_table
|
| 452 |
-
fin_table_cols = []
|
| 453 |
-
|
| 454 |
-
for _,row in df.iterrows():
|
| 455 |
-
field_name = row['Field_Name']
|
| 456 |
-
source_table = row['Source Table']
|
| 457 |
-
source_column = row['Source Column']
|
| 458 |
-
alias = row['Alias']
|
| 459 |
-
|
| 460 |
-
if pd.notna(row['Direct/Derived']):
|
| 461 |
-
if (source_column in join_fields):
|
| 462 |
-
fin_table_cols.append(f'{source_column} AS "{field_name}"')
|
| 463 |
-
else:
|
| 464 |
-
fin_table_cols.append(f'"{field_name}"')
|
| 465 |
-
|
| 466 |
-
fin_table_cols = ',\n\t\t'.join(fin_table_cols)
|
| 467 |
-
fin_sql = f"INSERT INTO {temp_table_schema}.{fin_table_name}\n\tSELECT \n\t\t{fin_table_cols} \nFROM\n\t{temp_table_schema}.TEMP_{temp_table}_{c-1};"
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
condition_col = '_'.join(incr_cols.replace(f'{base_table}.', '').split(','))
|
| 471 |
-
proc_query.append(f"DELETE FROM {temp_table_schema}.{fin_table_name}\nWHERE {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} IN (SELECT {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} FROM {temp_table_schema}.INCR1_{temp_table});")
|
| 472 |
-
proc_query.append(fin_sql)
|
| 473 |
-
|
| 474 |
-
for table in incr_tables:
|
| 475 |
sql = f"""
|
| 476 |
-
|
| 477 |
-
|
| 478 |
SELECT
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
FROM {temp_table_schema}.{temp_table}_INCR where table_name = '{table}'
|
| 486 |
-
);"""
|
| 487 |
proc_query.append(sql)
|
| 488 |
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
def create_df(query, table_df_mapping, table_usage_count):
|
| 492 |
-
script = []
|
| 493 |
-
query = ' '.join(query.split()).strip()
|
| 494 |
-
match = re.match(r'CREATE TABLE (\w+\.\w+\.\w+) AS (SELECT .+)', query, re.IGNORECASE)
|
| 495 |
-
source_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE)
|
| 496 |
-
source_tables = [table for pair in source_tables for table in pair if table]
|
| 497 |
-
|
| 498 |
-
if not match:
|
| 499 |
-
raise ValueError('Invalid SQL')
|
| 500 |
-
table_name = match.group(1).split('.')[2]
|
| 501 |
-
select_statement = match.group(2)
|
| 502 |
-
create_script = f'{table_name} = spark.sql(""" {select_statement} """)'
|
| 503 |
-
persist_script = f'{table_name} = {table_name}.persist()'
|
| 504 |
-
view_script = f'{table_name}.createOrReplaceTempView("{table_name}")'
|
| 505 |
-
|
| 506 |
-
for table in source_tables:
|
| 507 |
-
create_script = create_script.replace(table, table_df_mapping[table])
|
| 508 |
-
|
| 509 |
-
script.append(f"\n\t\t######################---------Creating table {create_script.split('=')[0].strip()}-------############################")
|
| 510 |
-
script.append(create_script)
|
| 511 |
-
script.append(persist_script)
|
| 512 |
-
script.append(view_script)
|
| 513 |
-
script.append(f'''print("{create_script.split('=')[0].strip()} count: ", {create_script.split('=')[0].strip()}.count()''')
|
| 514 |
-
|
| 515 |
-
if 'INCR2_' in table_name:
|
| 516 |
-
x = table_name.split('INCR2_')[1]
|
| 517 |
-
if x in table_details_mapping.keys():
|
| 518 |
-
script.append(f"\n\t\t######################---------Updating the max_update_date in audit-------############################")
|
| 519 |
-
script.append(f"{x}_max_update_date = INCR2_{x}.agg({{'_hoodie_commit_time' : 'max'}}).first()[0]")
|
| 520 |
-
script.append(f"{x}_max_source_reference_date = INCR2_{x}.agg(max(to_timestamp('{table_details_mapping[x][1].replace(x+'.','')}','yyyy-MM-dd-HH.mm.ss.SSSSSS'))).first()[0]")
|
| 521 |
-
script.append(f"insert_max_update_date(spark,redshift_conn, config['application_name'],'{x}',{x}_max_update_date,{x}_max_source_reference_date, max_batch_id, config)")
|
| 522 |
-
script.append('\n')
|
| 523 |
-
|
| 524 |
-
for table in source_tables:
|
| 525 |
-
table_usage_count[table.split('.')[2]] -= 1
|
| 526 |
|
| 527 |
-
|
| 528 |
-
if
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
for _, group in incr_join_grps:
|
| 570 |
-
for table in _.split():
|
| 571 |
-
if base_table != table:
|
| 572 |
-
join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')]
|
| 573 |
-
join_keys = [t.strip() for t in ','.join(base_pk).split(',')]
|
| 574 |
-
join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')]
|
| 575 |
-
join_cond = split_join_condition(incr_table_join_info[table])
|
| 576 |
-
join_condition = join_incr(join_cond)
|
| 577 |
-
source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
|
| 578 |
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
-
|
|
|
|
| 652 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
#
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
st.subheader('AUTOMATED SOURCE TO TARGET MAPPING')
|
| 684 |
mode= st.selectbox('Select Mode of Mapping',('Supervised Mapping(You Have Sufficient Sample Data in Target Template)', 'Unsupervised Mapping(You Do Not Have Sufficient Sample Data in Target Template)'), index=None,placeholder='Select category of table')
|
| 685 |
if mode == 'Supervised Mapping(You Have Sufficient Sample Data in Target Template)':
|
|
|
|
| 54 |
}
|
| 55 |
</style>
|
| 56 |
""", unsafe_allow_html=True)
|
| 57 |
+
|
| 58 |
+
def read_excel(path, sheet):
|
| 59 |
+
df = pd.read_excel(path, sheet_name = sheet, dtype = 'str')
|
| 60 |
+
return df
|
| 61 |
+
|
| 62 |
+
def split_join_condition(join_condition):
|
| 63 |
+
conditions = []
|
| 64 |
+
condition = ''
|
| 65 |
+
bracket_count = 0
|
| 66 |
+
|
| 67 |
+
for char in join_condition:
|
| 68 |
+
if char == '(':
|
| 69 |
+
bracket_count += 1
|
| 70 |
+
elif char == ')':
|
| 71 |
+
bracket_count -+ 1
|
| 72 |
+
if char == ',' and bracket_count == 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
conditions.append(condition.strip())
|
| 74 |
+
condition = ''
|
| 75 |
+
else:
|
| 76 |
+
condition += char
|
| 77 |
+
if condition:
|
| 78 |
+
conditions.append(condition.strip())
|
| 79 |
+
|
| 80 |
+
return conditions
|
| 81 |
+
|
| 82 |
+
def join_incr(join_conditions):
|
| 83 |
+
join = []
|
| 84 |
+
join_pattern = re.compile(r'(\w+\.\w+)\s*=\s*(\w+\w.\w+)', re.IGNORECASE)
|
| 85 |
+
for join_condition in join_conditions:
|
| 86 |
+
parts = re.split(r'\sAND\s|\sOR\s', join_condition, flags = re.IGNORECASE)
|
| 87 |
+
temp = [x.strip() for x in parts if join_pattern.match(x.strip())]
|
| 88 |
+
join.append(' AND '.join(temp))
|
| 89 |
+
return join
|
| 90 |
+
|
| 91 |
+
def generate_sql(temp_table):
|
| 92 |
+
proc_query = []
|
| 93 |
+
base_table = None
|
| 94 |
|
| 95 |
+
source_table_schema = 'MAIN.GOLD'
|
| 96 |
+
temp_table_schema = 'MAIN.GOLD'
|
| 97 |
+
base_pk = []
|
| 98 |
|
| 99 |
+
join_fields = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
for _,row in df.iterrows():
|
| 102 |
+
source_table = row['Source Table']
|
| 103 |
+
primary_key = row['Primary Key']
|
| 104 |
+
source_column = row['Source Column']
|
| 105 |
+
alias = row['Alias']
|
| 106 |
+
joining_keys = row['Joining Keys']
|
| 107 |
+
|
| 108 |
+
if not base_table:
|
| 109 |
+
if primary_key == 'Y':
|
| 110 |
+
base_table = source_table
|
| 111 |
+
base_pk.append(joining_keys)
|
| 112 |
+
|
| 113 |
+
if pd.notna(joining_keys):
|
| 114 |
+
keys = [x.strip() for x in joining_keys.split(',')]
|
| 115 |
+
for x in keys:
|
| 116 |
+
if x not in join_fields:
|
| 117 |
+
join_fields.add(x)
|
| 118 |
+
|
| 119 |
+
unique_cols = ['Source Table', 'Joining Keys', 'Primary Key', 'Join Type','Join Tables','Join Condition']
|
| 120 |
+
unique_df = df.drop_duplicates(subset = unique_cols)
|
| 121 |
+
|
| 122 |
+
incremantal_mapping = {}
|
| 123 |
+
incr_joins = {}
|
| 124 |
+
|
| 125 |
+
for _,row in unique_df.iterrows():
|
| 126 |
+
|
| 127 |
+
source_table = row['Source Table']
|
| 128 |
+
source_column = row['Source Column']
|
| 129 |
+
joining_keys = row['Joining Keys']
|
| 130 |
+
primary_key = row['Primary Key']
|
| 131 |
+
direct_derived = row['Direct/Derived']
|
| 132 |
+
join_type = row['Join Type']
|
| 133 |
+
join_tables = row['Join Tables']
|
| 134 |
+
join_condition = row['Join Condition']
|
| 135 |
+
|
| 136 |
+
if source_table == base_table:
|
| 137 |
+
if primary_key == 'Y':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
key = (source_table, joining_keys, join_type, join_tables, join_condition)
|
| 139 |
key1 = source_table
|
| 140 |
+
else:
|
| 141 |
+
continue
|
| 142 |
+
else:
|
| 143 |
+
key = (source_table, joining_keys, join_type, join_tables, join_condition)
|
| 144 |
+
key1 = source_table
|
| 145 |
+
if pd.notna(direct_derived) and pd.notna(source_table) and pd.notna(source_column):
|
| 146 |
+
if key not in incremantal_mapping:
|
| 147 |
+
incremantal_mapping[key] = {
|
| 148 |
+
'source_table': source_table,
|
| 149 |
+
'joining_keys':joining_keys,
|
| 150 |
+
'join_type': join_type,
|
| 151 |
+
'join_tables': join_tables,
|
| 152 |
+
'join_condition': join_condition
|
| 153 |
+
}
|
| 154 |
+
if key1 not in incr_joins:
|
| 155 |
+
if pd.notna(direct_derived) and direct_derived == 'DERIVED':
|
| 156 |
+
incr_joins[key1] = {
|
| 157 |
'join_type': join_type,
|
| 158 |
+
'join_tables': ', '.join([x.strip() for x in join_tables.split(',') if x != base_table]),
|
| 159 |
'join_condition': join_condition
|
| 160 |
}
|
| 161 |
+
incremental_df = pd.DataFrame(incremantal_mapping.values())
|
| 162 |
+
incr_join_grps = incremental_df.groupby(['source_table'])
|
| 163 |
+
proc_query.append(f'TRUNCATE TABLE {temp_table_schema}.{temp_table}_INCR;')
|
| 164 |
+
|
| 165 |
+
incr_table_join_info = {}
|
| 166 |
+
for _,row in incremental_df.iterrows():
|
| 167 |
+
source_table = row['source_table']
|
| 168 |
+
|
| 169 |
+
if source_table != base_table:
|
| 170 |
+
joining_keys = row['joining_keys']
|
| 171 |
+
join_type = row['join_type']
|
| 172 |
+
join_tables = [x.strip() for x in row['join_tables'].split(',')]
|
| 173 |
+
index = join_tables.index(source_table)
|
| 174 |
+
join_condition = [x.strip() for x in row['join_condition'].split(',')][0:index]
|
| 175 |
+
incr_table_join_info[source_table] = ', '.join(join_condition)
|
| 176 |
+
|
| 177 |
+
incr_query = []
|
| 178 |
+
incr_cols = ''
|
| 179 |
+
incr_tables = []
|
| 180 |
+
incr_join = {}
|
| 181 |
+
|
| 182 |
+
for _, group in incr_join_grps:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
+
for table in _.split():
|
| 185 |
+
if base_table != table:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
+
join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')]
|
| 188 |
+
join_keys = [t.strip() for t in ','.join(base_pk).split(',')]
|
| 189 |
+
join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')]
|
| 190 |
+
join_cond = split_join_condition(incr_table_join_info[table])
|
| 191 |
+
join_condition = join_incr(join_cond)
|
| 192 |
+
source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
|
| 193 |
+
|
| 194 |
+
join_key_list = []
|
| 195 |
+
for x in join_keys:
|
| 196 |
+
join_key_list.append(f'{base_table}.{x}')
|
| 197 |
+
join_key = ', '.join(join_key_list)
|
| 198 |
+
|
| 199 |
+
for y in source_table:
|
| 200 |
sql = f"""
|
| 201 |
+
INSERT INTO {temp_table_schema}.{temp_table}_INCR
|
| 202 |
+
(
|
| 203 |
+
SELECT {join_key}, {table_details_mapping[y][0]}, {table_details_mapping[y][1]}, '{y}', 1, CURRENT_TIMESTAMP
|
| 204 |
+
FROM {source_table_schema}.{base_table} {base_table}"""
|
| 205 |
+
|
| 206 |
+
incr_join_text = ''
|
| 207 |
+
for i in range(len(join_condition)):
|
| 208 |
+
sql += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 209 |
+
incr_join_text += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 210 |
+
incr_join[y] = incr_join_text
|
| 211 |
+
|
| 212 |
+
sql += f"""
|
| 213 |
+
WHERE COALESCE({join_tables[i+1]}.operation,'NA') <> 'D'
|
| 214 |
+
AND TO_TIMESTAMP( CAST(SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),1,4) || '-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),5,2) ||'-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),7,2) || ' ' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),9,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),11,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS') > (SELECT MAX(max_update_date) FROM audit.reportingdb_audit_tbl_{temp_table} WHERE mart_table_name='{temp_table}' and src_table_name='{y}')
|
| 215 |
+
);"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
+
incr_query.append(sql)
|
| 218 |
+
incr_tables.append(y)
|
| 219 |
+
|
| 220 |
+
else:
|
| 221 |
+
source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
|
| 222 |
+
join_keys = [t.strip() for t in group['joining_keys'].iloc[0].split(',')]
|
| 223 |
+
|
| 224 |
+
join_key_list = []
|
| 225 |
+
for x in join_keys:
|
| 226 |
+
join_key_list.append(f'{base_table}.{x}')
|
| 227 |
+
join_key = ', '.join(join_key_list)
|
| 228 |
|
| 229 |
+
incr_cols = join_key
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
sql = f"""
|
| 231 |
+
INSERT INTO {temp_table_schema}.{temp_table}_INCR
|
| 232 |
+
(
|
| 233 |
+
SELECT {join_key}, {table_details_mapping[base_table][0]}, {table_details_mapping[base_table][1]}, '{base_table}', 1, CURRENT_TIMESTAMP
|
| 234 |
+
FROM {source_table_schema}.{base_table} {base_table}
|
| 235 |
+
WHERE COALESCE(operation,'NA') <> 'D'
|
| 236 |
+
AND TO_TIMESTAMP( CAST(SUBSTRING((_hoodie_commit_time),1,4) || '-' || SUBSTRING((_hoodie_commit_time),5,2) ||'-' || SUBSTRING((_hoodie_commit_time),7,2) || ' ' || SUBSTRING((_hoodie_commit_time),9,2) ||':' || SUBSTRING((_hoodie_commit_time),11,2) ||':' || SUBSTRING((_hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS') > (SELECT MAX(max_update_date) FROM audit.reportingdb_audit_tbl_{temp_table} WHERE mart_table_name='{temp_table}' and src_table_name='{base_table}')
|
| 237 |
+
);"""
|
|
|
|
|
|
|
| 238 |
proc_query.append(sql)
|
| 239 |
+
incr_tables.append(base_table)
|
| 240 |
+
|
| 241 |
+
proc_query.append('\n'.join(incr_query))
|
| 242 |
+
proc_query.append(f'TRUNCATE TABLE {temp_table_schema}.INCR1_{temp_table};')
|
| 243 |
+
|
| 244 |
+
sql = f"""
|
| 245 |
+
INSERT INTO {temp_table_schema}.INCR1_{temp_table}
|
| 246 |
+
(
|
| 247 |
+
SELECT DISTINCT {incr_cols.replace(f'{base_table}.', '')}
|
| 248 |
+
FROM {temp_table_schema}.{temp_table}_INCR
|
| 249 |
+
);"""
|
| 250 |
+
|
| 251 |
+
proc_query.append(sql)
|
| 252 |
+
|
| 253 |
+
incr_table_dict = {}
|
| 254 |
+
for table in incr_tables:
|
| 255 |
+
if table == base_table:
|
| 256 |
+
incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}'
|
| 257 |
+
else:
|
| 258 |
+
p = [x for x in incr_join[table].split('\n\t') if len(x) > 1]
|
| 259 |
+
if len(p) == 1:
|
| 260 |
+
incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}'
|
| 261 |
else:
|
| 262 |
+
incr_table_dict[table] = f'{source_table_schema}.{table}'
|
| 263 |
+
|
| 264 |
+
s = []
|
| 265 |
+
for table in incr_tables:
|
| 266 |
+
incr2_sql_list = []
|
| 267 |
+
|
| 268 |
+
if table == base_table:
|
| 269 |
+
for key in incr_cols.replace(f'{base_table}.', '').split(','):
|
| 270 |
+
incr2_sql_list.append(f"{base_table}.{key} = A.{key}")
|
| 271 |
+
incr2_sql_join = ' AND '.join(incr2_sql_list)
|
| 272 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
sql = f"""
|
| 274 |
+
CREATE TABLE {temp_table_schema}.INCR2_{table}
|
| 275 |
+
AS
|
| 276 |
SELECT
|
| 277 |
+
{table}.*
|
| 278 |
+
FROM
|
| 279 |
+
{source_table_schema}.{table} {table}
|
| 280 |
+
INNER JOIN
|
| 281 |
+
{temp_table_schema}.INCR1_{temp_table} A ON {incr2_sql_join}; """
|
| 282 |
+
proc_query.append(f'DROP TABLE IF EXISTS {temp_table_schema}.INCR2_{table};')
|
|
|
|
|
|
|
| 283 |
proc_query.append(sql)
|
| 284 |
|
| 285 |
+
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
+
p = [x for x in incr_join[table].split('\n\t') if len(x) > 1]
|
| 288 |
+
if len(p) == 1:
|
| 289 |
+
sql = f"""
|
| 290 |
+
CREATE TABLE {temp_table_schema}.INCR2_{table}
|
| 291 |
+
AS
|
| 292 |
+
SELECT
|
| 293 |
+
{table}.*
|
| 294 |
+
FROM
|
| 295 |
+
{temp_table_schema}.INCR2_{base_table} {base_table} {incr_join[table]};"""
|
| 296 |
+
s.append(f'DROP TABLE IF EXISTS {temp_table_schema}.INCR2_{table};')
|
| 297 |
+
s.append(sql)
|
| 298 |
|
| 299 |
+
for x in s:
|
| 300 |
+
proc_query.append(x)
|
| 301 |
+
|
| 302 |
+
select_clause = []
|
| 303 |
+
from_clause = []
|
| 304 |
+
where_clause = []
|
| 305 |
+
|
| 306 |
+
for _,row in df.iterrows():
|
| 307 |
+
field_name = row['Field_Name']
|
| 308 |
+
source_table = row['Source Table']
|
| 309 |
+
source_column = row['Source Column']
|
| 310 |
+
joining_keys = row['Joining Keys']
|
| 311 |
+
primary_key = row['Primary Key']
|
| 312 |
+
direct_derived = row['Direct/Derived']
|
| 313 |
+
join_type = row['Join Type']
|
| 314 |
+
join_tables = row['Join Tables']
|
| 315 |
+
join_condition = row['Join Condition']
|
| 316 |
+
column_operation = row['Column Operations']
|
| 317 |
+
alias = row['Alias']
|
| 318 |
+
granularity = row['Granularity']
|
| 319 |
+
filter_condition = row['Filter Condition']
|
| 320 |
+
clauses = row['Clauses']
|
| 321 |
+
ordering = row['Ordering']
|
| 322 |
+
|
| 323 |
+
if pd.notna(direct_derived):
|
| 324 |
+
if pd.notna(column_operation):
|
| 325 |
+
if len(column_operation.split()) == 1:
|
| 326 |
+
select_expr = f'{column_operation.upper()}({source_table}.{source_column})'
|
| 327 |
+
else:
|
| 328 |
+
select_expr = column_operation
|
| 329 |
+
else:
|
| 330 |
+
if pd.notna(source_table):
|
| 331 |
+
select_expr = f'{source_table}.{source_column}'
|
| 332 |
+
else:
|
| 333 |
+
select_expr = source_column
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
+
if source_column not in join_fields:
|
| 336 |
+
if pd.notna(alias):
|
| 337 |
+
select_expr += f' AS {alias}'
|
| 338 |
+
else:
|
| 339 |
+
if pd.notna(column_operation) and pd.notna(source_column):
|
| 340 |
+
select_expr += f' AS {source_column}'
|
| 341 |
+
|
| 342 |
+
if direct_derived.upper() == 'DIRECT':
|
| 343 |
+
select_clause.append(select_expr)
|
| 344 |
+
elif direct_derived.upper() == 'DERIVED_BASE':
|
| 345 |
+
select_clause.append(select_expr)
|
| 346 |
|
| 347 |
+
if pd.notna(filter_condition):
|
| 348 |
+
where_clause.append(filter_condition)
|
| 349 |
+
|
| 350 |
+
select_query = ',\n\t'.join(select_clause)
|
| 351 |
+
sql_query = f"CREATE TABLE {temp_table_schema}.{base_table}_BASE\nAS \n\tSELECT \n\t{select_query} \nFROM\n\t{incr_table_dict[base_table]} {base_table}"
|
| 352 |
+
if where_clause:
|
| 353 |
+
sql_query += f"\nWHERE {' AND'.join(where_clause)}"
|
| 354 |
+
sql_query += ';'
|
| 355 |
+
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{base_table}_BASE;")
|
| 356 |
+
proc_query.append(sql_query)
|
| 357 |
+
|
| 358 |
+
df['Clauses'].fillna('', inplace = True)
|
| 359 |
+
df['Ordering'].fillna('', inplace = True)
|
| 360 |
+
c = 1
|
| 361 |
+
temp_base_table = f'{base_table}_BASE'
|
| 362 |
+
grp_cols = ['Join Condition', 'Clauses', 'Ordering']
|
| 363 |
+
join_grps = df[df['Direct/Derived'] == 'DERIVED'].groupby(['Join Condition', 'Clauses', 'Ordering'])
|
| 364 |
+
temp_tables_sql = []
|
| 365 |
+
for (join_condition,clauses,ordering), group in join_grps:
|
| 366 |
+
if pd.notna(group['Direct/Derived'].iloc[0]):
|
| 367 |
+
if group['Direct/Derived'].iloc[0].upper() == 'DERIVED':
|
| 368 |
+
join_tables = [t.strip() for t in group['Join Tables'].iloc[0].split(',')]
|
| 369 |
+
join_keys = [t.strip() for t in group['Joining Keys'].iloc[0].split(',')]
|
| 370 |
+
join_type = [t.strip() for t in group['Join Type'].iloc[0].split(',')]
|
| 371 |
+
join_condition = split_join_condition(group['Join Condition'].iloc[0])
|
| 372 |
+
temp_table_name = f"TEMP_{group['Source Table'].iloc[0]}"
|
| 373 |
+
source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')]
|
| 374 |
+
alias = [t.strip() for t in (','.join(group['Alias'])).split(',')]
|
| 375 |
+
source_table = [t.strip() for t in (','.join(group['Source Table'])).split(',')]
|
| 376 |
+
|
| 377 |
+
base_cols = []
|
| 378 |
+
for join_key in join_keys:
|
| 379 |
+
base_cols.append(f'{join_tables[0]}.{join_key}')
|
| 380 |
+
|
| 381 |
+
for s_table,col,alias in zip(source_table,source_column,alias):
|
| 382 |
+
if pd.notna(group['Column Operations'].iloc[0]):
|
| 383 |
+
if len(group['Column Operations'].iloc[0].split()) == 1:
|
| 384 |
+
select_expr = f"{group['Column Operations'].iloc[0].upper()}({s_table}.{col})"
|
| 385 |
+
else:
|
| 386 |
+
select_expr = group['Column Operations'].iloc[0]
|
| 387 |
+
else:
|
| 388 |
+
if pd.notna(s_table):
|
| 389 |
+
select_expr = f"{s_table}.{col}"
|
| 390 |
+
else:
|
| 391 |
+
select_expr = col
|
| 392 |
|
| 393 |
+
if alias:
|
| 394 |
+
select_expr += f" AS {alias}"
|
| 395 |
+
base_cols.append(select_expr)
|
| 396 |
+
|
| 397 |
+
if ordering:
|
| 398 |
+
base_cols.append(f"{ordering} AS RN")
|
| 399 |
+
|
| 400 |
+
sql = ',\n\t\t'.join(base_cols)
|
| 401 |
+
|
| 402 |
+
join_sql = f"SELECT \n\t\t{sql} \nFROM\n\t{incr_table_dict[base_table]} {join_tables[0]}"
|
| 403 |
+
for i in range(len(join_type)):
|
| 404 |
+
join_sql += f'\n\t{join_type[i]} JOIN {incr_table_dict[join_tables[i+1]]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 405 |
+
if clauses:
|
| 406 |
+
join_sql += f'\n\t{clauses}'
|
| 407 |
+
join_sql += ';'
|
| 408 |
+
|
| 409 |
+
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{temp_table_name};")
|
| 410 |
+
proc_query.append(f"CREATE TABLE {temp_table_schema}.{temp_table_name}\nAS \n\t{join_sql}")
|
| 411 |
+
|
| 412 |
+
granularity = [t.strip() for t in group['Granularity'].iloc[0].split(',')]
|
| 413 |
+
|
| 414 |
+
sql = []
|
| 415 |
+
for key in join_keys:
|
| 416 |
+
sql.append(f"A.{key} = B.{key}")
|
| 417 |
+
|
| 418 |
+
temp_cols = []
|
| 419 |
+
temp_cols.append('A.*')
|
| 420 |
+
|
| 421 |
+
source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')]
|
| 422 |
+
alias = [t.strip() for t in (','.join(group['Alias'])).split(',')]
|
| 423 |
+
|
| 424 |
+
for col,alias in zip(source_column,alias):
|
| 425 |
+
select_expr = f"B.{col}"
|
| 426 |
+
if alias:
|
| 427 |
+
select_expr = f"B.{alias}"
|
| 428 |
+
else:
|
| 429 |
+
select_expr = f"B.{col}"
|
| 430 |
+
temp_cols.append(select_expr)
|
| 431 |
+
|
| 432 |
+
temp_select_query = ',\n\t\t'.join(temp_cols)
|
| 433 |
+
|
| 434 |
+
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.TEMP_{temp_table}_{c};")
|
| 435 |
+
|
| 436 |
+
base_sql = f"CREATE TABLE {temp_table_schema}.TEMP_{temp_table}_{c}\nAS \n\tSELECT \n\t\t{temp_select_query} \nFROM\n\t{temp_table_schema}.{temp_base_table} AS A"
|
| 437 |
+
base_sql += f"\n\tLEFT OUTER JOIN {temp_table_schema}.{temp_table_name} B ON {' AND '.join(sql)}"
|
| 438 |
+
|
| 439 |
+
if '1:1' in granularity and len(ordering) > 1:
|
| 440 |
+
base_sql += f" AND B.RN = 1"
|
| 441 |
+
base_sql += ';'
|
| 442 |
+
|
| 443 |
+
temp_base_table = f'TEMP_{temp_table}_{c}'
|
| 444 |
+
c += 1
|
| 445 |
+
proc_query.append(base_sql)
|
| 446 |
|
| 447 |
+
fin_table_name = temp_table
|
| 448 |
+
fin_table_cols = []
|
| 449 |
|
| 450 |
+
for _,row in df.iterrows():
|
| 451 |
+
field_name = row['Field_Name']
|
| 452 |
+
source_table = row['Source Table']
|
| 453 |
+
source_column = row['Source Column']
|
| 454 |
+
alias = row['Alias']
|
| 455 |
|
| 456 |
+
if pd.notna(row['Direct/Derived']):
|
| 457 |
+
if (source_column in join_fields):
|
| 458 |
+
fin_table_cols.append(f'{source_column} AS "{field_name}"')
|
| 459 |
+
else:
|
| 460 |
+
fin_table_cols.append(f'"{field_name}"')
|
| 461 |
+
|
| 462 |
+
fin_table_cols = ',\n\t\t'.join(fin_table_cols)
|
| 463 |
+
fin_sql = f"INSERT INTO {temp_table_schema}.{fin_table_name}\n\tSELECT \n\t\t{fin_table_cols} \nFROM\n\t{temp_table_schema}.TEMP_{temp_table}_{c-1};"
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
condition_col = '_'.join(incr_cols.replace(f'{base_table}.', '').split(','))
|
| 467 |
+
proc_query.append(f"DELETE FROM {temp_table_schema}.{fin_table_name}\nWHERE {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} IN (SELECT {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} FROM {temp_table_schema}.INCR1_{temp_table});")
|
| 468 |
+
proc_query.append(fin_sql)
|
| 469 |
|
| 470 |
+
for table in incr_tables:
|
| 471 |
+
sql = f"""
|
| 472 |
+
INSERT INTO audit.reportingdb_audit_tbl_{temp_table}
|
| 473 |
+
(
|
| 474 |
+
SELECT
|
| 475 |
+
'{temp_table}' as mart_table_name,
|
| 476 |
+
'{table}' as src_table_name,
|
| 477 |
+
coalesce( max(TO_TIMESTAMP( CAST(SUBSTRING((_hoodie_commit_time),1,4) || '-' || SUBSTRING((_hoodie_commit_time),5,2) ||'-' || SUBSTRING((_hoodie_commit_time),7,2) || ' ' || SUBSTRING((_hoodie_commit_time),9,2) ||':' || SUBSTRING((_hoodie_commit_time),11,2) ||':' || SUBSTRING((_hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS')),(select max(max_update_date) from audit.reportingdb_audit_tbl_{temp_table} where Mart_Table_Name='{temp_table}' and Src_Table_Name= '{table}')) max_update_date,
|
| 478 |
+
CURRENT_TIMESTAMP as load_timestamp,
|
| 479 |
+
coalesce(max(prev_updt_ts),(select max(source_reference_date) from audit.reportingdb_audit_tbl_{temp_table} where Mart_Table_Name='{temp_table}' and Src_Table_Name= '{table}')) AS source_reference_date,
|
| 480 |
+
max(nvl(batch_number,0))+1
|
| 481 |
+
FROM {temp_table_schema}.{temp_table}_INCR where table_name = '{table}'
|
| 482 |
+
);"""
|
| 483 |
+
proc_query.append(sql)
|
| 484 |
|
| 485 |
+
return base_table, base_pk, proc_query, incr_join_grps, incr_table_join_info, incr_join, temp_table_schema
|
| 486 |
+
|
| 487 |
+
def create_df(query, table_df_mapping, table_usage_count):
    """Translate one ``CREATE TABLE ... AS SELECT ...`` statement into PySpark script lines.

    Parses the Redshift-style DDL, rewrites fully-qualified source table names to their
    DataFrame variable names via ``table_df_mapping``, and emits the ``spark.sql`` /
    ``persist`` / ``createOrReplaceTempView`` boilerplate plus an audit update for
    ``INCR2_`` tables. Decrements ``table_usage_count`` for each source table and emits
    an ``unpersist()`` once a table is no longer needed.

    Args:
        query: A single SQL statement of the form
            ``CREATE TABLE <db.schema.table> AS SELECT ...``.
        table_df_mapping: Maps fully-qualified table names (``a.b.c``) to the
            DataFrame variable name used in the generated script.
        table_usage_count: Mutable remaining-use counter per table name; mutated
            in place (decremented for every source table of this query).

    Returns:
        The generated script lines joined with ``'\\n\\t\\t'`` (matches the
        indentation of the surrounding code template).

    Raises:
        ValueError: If ``query`` does not match the expected CREATE TABLE shape.
    """
    script = []
    # Collapse all whitespace so the regexes below see a single-line statement.
    query = ' '.join(query.split()).strip()
    match = re.match(r'CREATE TABLE (\w+\.\w+\.\w+) AS (SELECT .+)', query, re.IGNORECASE)
    # Each findall hit is a (FROM-group, JOIN-group) pair with one side empty; flatten.
    source_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE)
    source_tables = [table for pair in source_tables for table in pair if table]

    if not match:
        raise ValueError('Invalid SQL')
    table_name = match.group(1).split('.')[2]
    select_statement = match.group(2)
    create_script = f'{table_name} = spark.sql(""" {select_statement} """)'
    persist_script = f'{table_name} = {table_name}.persist()'
    view_script = f'{table_name}.createOrReplaceTempView("{table_name}")'

    # Replace fully-qualified names with the registered temp-view / DataFrame names.
    for table in source_tables:
        create_script = create_script.replace(table, table_df_mapping[table])

    script.append(f"\n\t\t######################---------Creating table {create_script.split('=')[0].strip()}-------############################")
    script.append(create_script)
    script.append(persist_script)
    script.append(view_script)
    # BUGFIX: the generated print() was missing its closing parenthesis, which made
    # the emitted Spark script a SyntaxError.
    script.append(f'''print("{create_script.split('=')[0].strip()} count: ", {create_script.split('=')[0].strip()}.count())''')

    if 'INCR2_' in table_name:
        x = table_name.split('INCR2_')[1]
        # Only known source tables carry audit metadata (commit-time / reference-date columns).
        if x in table_details_mapping.keys():
            script.append(f"\n\t\t######################---------Updating the max_update_date in audit-------############################")
            script.append(f"{x}_max_update_date = INCR2_{x}.agg({{'_hoodie_commit_time' : 'max'}}).first()[0]")
            script.append(f"{x}_max_source_reference_date = INCR2_{x}.agg(max(to_timestamp('{table_details_mapping[x][1].replace(x+'.','')}','yyyy-MM-dd-HH.mm.ss.SSSSSS'))).first()[0]")
            script.append(f"insert_max_update_date(spark,redshift_conn, config['application_name'],'{x}',{x}_max_update_date,{x}_max_source_reference_date, max_batch_id, config)")
            script.append('\n')

    # Consume one "use" per source table referenced by this statement.
    for table in source_tables:
        table_usage_count[table.split('.')[2]] -= 1

    # Once a table's remaining-use count hits zero it can be unpersisted
    # (INCR1_ tables are kept alive; they are reused later in the flow).
    for table in source_tables:
        if table_usage_count[table.split('.')[2]] == 0 and 'INCR1_' not in table:
            unpersist_script = f"{table.split('.')[2]}.unpersist()"
            script.append(unpersist_script)

    return '\n\t\t'.join(script)
| 529 |
+
|
| 530 |
+
def generate_spark(proc_query, incr_join_grps, base_table, base_pk, incr_table_join_info, incr_join, temp_table_schema):
    """Assemble the full PySpark job script from the generated Redshift procedure queries.

    Builds: source-read boilerplate, audit max_update_date reads, the INCR1 union
    query, INCR2 audit updates, one ``create_df`` translation per CREATE TABLE
    statement, and finally splices everything into ``template.txt`` at the
    ``INSERT_CODE_1`` / ``INSERT_CODE_2`` placeholders.

    Relies on module-level state: ``temp_table``, ``table_details_mapping``,
    ``create_df``, ``split_join_condition`` and ``join_incr``.

    Args:
        proc_query: Ordered list of SQL statements produced earlier in the flow.
        incr_join_grps: pandas groupby of derived-join groups (key is a
            space-separated table list).
        base_table: Name of the driving source table.
        base_pk: Iterable of primary-key column names of ``base_table``.
        incr_table_join_info: Per-table raw join-condition strings.
        incr_join: Per-table rendered join clauses (newline/tab separated).
        temp_table_schema: Schema holding the temp/working tables.

    Returns:
        The completed script text (template with both code sections inserted).
    """
    table_usage_count = defaultdict(int)
    table_df_mapping = {}

    for query in proc_query:
        # BUGFIX: was `if 'CREATE TABLE' or 'DELETE' in query:` which is always
        # true ('CREATE TABLE' is a truthy literal), so every statement inflated
        # the usage counts and unpersist() was never emitted for some tables.
        if 'CREATE TABLE' in query or 'DELETE' in query:
            source_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE)
            source_tables = [table for pair in source_tables for table in pair if table]
            for table in source_tables:
                table_usage_count[table.split('.')[2]] += 1
                # DELETE statements consume a use but do not define a DataFrame name.
                if 'DELETE' not in query:
                    table_df_mapping[table] = table.split('.')[2]

    script = []
    for query in proc_query:
        if 'CREATE TABLE' in query:
            script.append(create_df(query, table_df_mapping, table_usage_count))

    spark_query = []
    spark_query.append("\t\t######################---------Reading source data -------############################")
    for table in table_details_mapping.keys():
        spark_query.append(f'{table} = read_file(spark, config, \"{table}\").filter("{table_details_mapping[table][2]}")')
        spark_query.append(f'{table} = {table}.persist()')
        spark_query.append(f'{table}.createOrReplaceTempView("{table}")')
        # BUGFIX: generated print() was missing its closing parenthesis.
        spark_query.append(f'print("{table} count: ", {table}.count())')
        spark_query.append('\n')

    spark_query.append("\n\t\t######################---------Reading records-------############################")
    for table in table_details_mapping.keys():
        spark_query.append(f"{table}_max_update_date = read_max_update_date(redshift_conn, config['application_name'],'{table}', config)")
        spark_query.append(f'{table}_max_update_date = {table}_max_update_date[0][0]')
        spark_query.append('\n')

    incr1_spark = []
    temp_incr1 = []
    for _, group in incr_join_grps:
        for table in _.split():
            if base_table != table:
                # Non-base table: change capture requires joining back to the base
                # table through the recorded join path for this group.
                join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')]
                join_keys = [t.strip() for t in ','.join(base_pk).split(',')]
                join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')]
                join_cond = split_join_condition(incr_table_join_info[table])
                join_condition = join_incr(join_cond)
                source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]

                join_key_list = []
                for x in join_keys:
                    join_key_list.append(f'{base_table}.{x}')
                join_key = ', '.join(join_key_list)

                for y in source_table:
                    sql = f"""SELECT {join_key} FROM {base_table} {base_table}"""

                    incr_join_text = ''
                    i = 0
                    for i in range(len(join_condition)):
                        sql += f' {join_type[i]} JOIN {join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
                        incr_join_text += f' {join_type[i]} JOIN {join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'

                    # The emitted text embeds a runtime str(..._max_update_date)
                    # concatenation into the generated Spark SQL string.
                    sql += f''' WHERE {join_tables[i+1]}._hoodie_commit_time > cast('"""+str({join_tables[i+1]}_max_update_date)+"""' as timestamp)'''
                    temp_incr1.append(sql)

            else:
                # Base table: filter on its own commit time directly.
                source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
                join_keys = [t.strip() for t in group['joining_keys'].iloc[0].split(',')]

                join_key_list = []
                for x in join_keys:
                    join_key_list.append(f'{base_table}.{x}')
                join_key = ', '.join(join_key_list)

                sql = f'''SELECT {join_key} FROM {base_table} {base_table} WHERE {base_table}._hoodie_commit_time > cast('"""+str({base_table}_max_update_date)+"""' as timestamp)'''
                incr1_spark.append(sql)
    for i in temp_incr1:
        incr1_spark.append(i)
    incr1_spark = '\nUNION\n'.join(incr1_spark)
    spark_query.append("\n\t\t######################---------Creating INCR1-------############################")
    spark_query.append(f'INCR1_{temp_table} = spark.sql(""" {incr1_spark} """)')
    spark_query.append(f'\n\t\tINCR1_{temp_table} = INCR1_{temp_table}.dropDuplicates()')
    spark_query.append(f'INCR1_{temp_table} = INCR1_{temp_table}.persist()')
    spark_query.append(f'INCR1_{temp_table}.createOrReplaceTempView("INCR1_{temp_table}")')
    spark_query.append(f'print("INCR1_{temp_table} count: ", INCR1_{temp_table}.count())')

    spark_query.append("\n\t\t######################---------Creating INCR2-------############################")
    for table in table_details_mapping.keys():
        if table in incr_join.keys():
            # Tables whose join text has more than one real line are read straight
            # from the source schema (no INCR2 table), so their audit row is
            # updated here instead of inside create_df.
            p = [x for x in incr_join[table].split('\n\t') if len(x) > 1]
            if len(p) > 1:
                spark_query.append(f"\n\t\t######################---------Updating the max_update_date in audit-------############################")
                spark_query.append(f"{table}_max_update_date = {table}.agg({{'_hoodie_commit_time' : 'max'}}).first()[0]")
                spark_query.append(f"{table}_max_source_reference_date = {table}.agg(max(to_timestamp('{table_details_mapping[table][1].replace(table+'.','')}','yyyy-MM-dd-HH.mm.ss.SSSSSS'))).first()[0]")
                spark_query.append(f"insert_max_update_date(spark,redshift_conn, config['application_name'],'{table}',{table}_max_update_date,{table}_max_source_reference_date, max_batch_id, config)")
                spark_query.append('\n')

    for query in script:
        spark_query.append(query)
        spark_query.append('\n')

    spark_query1 = []
    spark_query1.append('\n')
    for query in proc_query:
        # The statement that loads the final mart table also determines which
        # DataFrame must be written out to parquet for the Redshift load step.
        if f'{temp_table_schema}.{temp_table}\n' in query:
            final_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE)
            final_tables = [table.split('.')[2].strip() for pair in final_tables for table in pair if table and table.split('.')[2].strip() != temp_table][0]
            if 'INCR1_' in final_tables:
                spark_query.append(f"{final_tables}.write.mode('overwrite').parquet(config['incr2df_path'])")
            else:
                spark_query.append(f"{final_tables}.write.mode('overwrite').parquet(config['resultdf_path'])")
        spark_query1.append(f'''cur.execute(""" {query} """)''')
        spark_query1.append('\n')

    with open('template.txt') as file:
        template = file.read()

    result = template.replace('INSERT_CODE_1', '\n\t\t'.join(spark_query))
    result = result.replace('INSERT_CODE_2', '\t\t'.join(spark_query1))

    return result
| 648 |
+
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
# st.set_page_config(page_title='AUTOMATED SOURCE TO TARGET MAPPING', layout= 'wide')
|
| 652 |
+
# st.markdown("""
|
| 653 |
+
# <style>
|
| 654 |
+
|
| 655 |
+
# /* Remove blank space at top and bottom */
|
| 656 |
+
# .block-container {
|
| 657 |
+
# padding-top: 1.9rem;
|
| 658 |
+
# padding-bottom: 1rem;
|
| 659 |
+
# }
|
| 660 |
+
|
| 661 |
+
# /* Remove blank space at the center canvas */
|
| 662 |
+
# .st-emotion-cache-z5fcl4 {
|
| 663 |
+
# position: relative;
|
| 664 |
+
# top: -62px;
|
| 665 |
+
# }
|
| 666 |
+
|
| 667 |
+
# /* Make the toolbar transparent and the content below it clickable */
|
| 668 |
+
# .st-emotion-cache-18ni7ap {
|
| 669 |
+
# pointer-events: none;
|
| 670 |
+
# background: rgb(255 255 255 / 0%)
|
| 671 |
+
# }
|
| 672 |
+
# .st-emotion-cache-zq5wmm {
|
| 673 |
+
# pointer-events: auto;
|
| 674 |
+
# background: rgb(255 255 255);
|
| 675 |
+
# border-radius: 5px;
|
| 676 |
+
# }
|
| 677 |
+
# </style>
|
| 678 |
+
# """, unsafe_allow_html=True)
|
| 679 |
+
|
| 680 |
+
######
|
| 681 |
+
def main():
|
| 682 |
+
# st.title('PAGE TITLE') # Change this for each page
|
| 683 |
+
sidebar()
|
| 684 |
+
########
|
| 685 |
st.subheader('AUTOMATED SOURCE TO TARGET MAPPING')
|
| 686 |
mode= st.selectbox('Select Mode of Mapping',('Supervised Mapping(You Have Sufficient Sample Data in Target Template)', 'Unsupervised Mapping(You Do Not Have Sufficient Sample Data in Target Template)'), index=None,placeholder='Select category of table')
|
| 687 |
if mode == 'Supervised Mapping(You Have Sufficient Sample Data in Target Template)':
|