Spaces:
Build error
Build error
Update pages/5SOURCE TO TARGET MAPPING.py
Browse files- pages/5SOURCE TO TARGET MAPPING.py +597 -595
pages/5SOURCE TO TARGET MAPPING.py
CHANGED
|
@@ -54,632 +54,634 @@ st.markdown("""
|
|
| 54 |
}
|
| 55 |
</style>
|
| 56 |
""", unsafe_allow_html=True)
|
| 57 |
-
|
| 58 |
-
def
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
bracket_count += 1
|
| 74 |
-
elif char == ')':
|
| 75 |
-
bracket_count -+ 1
|
| 76 |
-
if char == ',' and bracket_count == 0:
|
| 77 |
-
conditions.append(condition.strip())
|
| 78 |
-
condition = ''
|
| 79 |
-
else:
|
| 80 |
-
condition += char
|
| 81 |
-
if condition:
|
| 82 |
conditions.append(condition.strip())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
-
join = []
|
| 88 |
-
join_pattern = re.compile(r'(\w+\.\w+)\s*=\s*(\w+\w.\w+)', re.IGNORECASE)
|
| 89 |
-
for join_condition in join_conditions:
|
| 90 |
-
parts = re.split(r'\sAND\s|\sOR\s', join_condition, flags = re.IGNORECASE)
|
| 91 |
-
temp = [x.strip() for x in parts if join_pattern.match(x.strip())]
|
| 92 |
-
join.append(' AND '.join(temp))
|
| 93 |
-
return join
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
source_column = row['Source Column']
|
| 133 |
-
joining_keys = row['Joining Keys']
|
| 134 |
-
primary_key = row['Primary Key']
|
| 135 |
-
direct_derived = row['Direct/Derived']
|
| 136 |
-
join_type = row['Join Type']
|
| 137 |
-
join_tables = row['Join Tables']
|
| 138 |
-
join_condition = row['Join Condition']
|
| 139 |
-
|
| 140 |
-
if source_table == base_table:
|
| 141 |
-
if primary_key == 'Y':
|
| 142 |
-
key = (source_table, joining_keys, join_type, join_tables, join_condition)
|
| 143 |
-
key1 = source_table
|
| 144 |
-
else:
|
| 145 |
-
continue
|
| 146 |
-
else:
|
| 147 |
key = (source_table, joining_keys, join_type, join_tables, join_condition)
|
| 148 |
key1 = source_table
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
'join_type': join_type,
|
| 155 |
-
'join_tables': join_tables,
|
| 156 |
'join_condition': join_condition
|
| 157 |
}
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
incr_query = []
|
| 182 |
-
incr_cols = ''
|
| 183 |
-
incr_tables = []
|
| 184 |
-
incr_join = {}
|
| 185 |
|
| 186 |
-
for
|
| 187 |
-
|
| 188 |
-
for table in _.split():
|
| 189 |
-
if base_table != table:
|
| 190 |
-
|
| 191 |
-
join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')]
|
| 192 |
-
join_keys = [t.strip() for t in ','.join(base_pk).split(',')]
|
| 193 |
-
join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')]
|
| 194 |
-
join_cond = split_join_condition(incr_table_join_info[table])
|
| 195 |
-
join_condition = join_incr(join_cond)
|
| 196 |
-
source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
|
| 197 |
-
|
| 198 |
-
join_key_list = []
|
| 199 |
-
for x in join_keys:
|
| 200 |
-
join_key_list.append(f'{base_table}.{x}')
|
| 201 |
-
join_key = ', '.join(join_key_list)
|
| 202 |
-
|
| 203 |
-
for y in source_table:
|
| 204 |
-
sql = f"""
|
| 205 |
-
INSERT INTO {temp_table_schema}.{temp_table}_INCR
|
| 206 |
-
(
|
| 207 |
-
SELECT {join_key}, {table_details_mapping[y][0]}, {table_details_mapping[y][1]}, '{y}', 1, CURRENT_TIMESTAMP
|
| 208 |
-
FROM {source_table_schema}.{base_table} {base_table}"""
|
| 209 |
-
|
| 210 |
-
incr_join_text = ''
|
| 211 |
-
for i in range(len(join_condition)):
|
| 212 |
-
sql += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 213 |
-
incr_join_text += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 214 |
-
incr_join[y] = incr_join_text
|
| 215 |
-
|
| 216 |
-
sql += f"""
|
| 217 |
-
WHERE COALESCE({join_tables[i+1]}.operation,'NA') <> 'D'
|
| 218 |
-
AND TO_TIMESTAMP( CAST(SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),1,4) || '-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),5,2) ||'-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),7,2) || ' ' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),9,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),11,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS') > (SELECT MAX(max_update_date) FROM audit.reportingdb_audit_tbl_{temp_table} WHERE mart_table_name='{temp_table}' and src_table_name='{y}')
|
| 219 |
-
);"""
|
| 220 |
-
|
| 221 |
-
incr_query.append(sql)
|
| 222 |
-
incr_tables.append(y)
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
| 234 |
sql = f"""
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
(
|
| 251 |
-
SELECT DISTINCT {incr_cols.replace(f'{base_table}.', '')}
|
| 252 |
-
FROM {temp_table_schema}.{temp_table}_INCR
|
| 253 |
-
);"""
|
| 254 |
-
|
| 255 |
-
proc_query.append(sql)
|
| 256 |
-
|
| 257 |
-
incr_table_dict = {}
|
| 258 |
-
for table in incr_tables:
|
| 259 |
-
if table == base_table:
|
| 260 |
-
incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}'
|
| 261 |
-
else:
|
| 262 |
-
p = [x for x in incr_join[table].split('\n\t') if len(x) > 1]
|
| 263 |
-
if len(p) == 1:
|
| 264 |
-
incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}'
|
| 265 |
-
else:
|
| 266 |
-
incr_table_dict[table] = f'{source_table_schema}.{table}'
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
-
|
| 273 |
-
for key in incr_cols.replace(f'{base_table}.', '').split(','):
|
| 274 |
-
incr2_sql_list.append(f"{base_table}.{key} = A.{key}")
|
| 275 |
-
incr2_sql_join = ' AND '.join(incr2_sql_list)
|
| 276 |
-
|
| 277 |
sql = f"""
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
{temp_table_schema}.INCR1_{temp_table} A ON {incr2_sql_join}; """
|
| 286 |
-
proc_query.append(f'DROP TABLE IF EXISTS {temp_table_schema}.INCR2_{table};')
|
| 287 |
proc_query.append(sql)
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
else:
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
s.append(sql)
|
| 302 |
-
|
| 303 |
-
for x in s:
|
| 304 |
-
proc_query.append(x)
|
| 305 |
-
|
| 306 |
-
select_clause = []
|
| 307 |
-
from_clause = []
|
| 308 |
-
where_clause = []
|
| 309 |
-
|
| 310 |
-
for _,row in df.iterrows():
|
| 311 |
-
field_name = row['Field_Name']
|
| 312 |
-
source_table = row['Source Table']
|
| 313 |
-
source_column = row['Source Column']
|
| 314 |
-
joining_keys = row['Joining Keys']
|
| 315 |
-
primary_key = row['Primary Key']
|
| 316 |
-
direct_derived = row['Direct/Derived']
|
| 317 |
-
join_type = row['Join Type']
|
| 318 |
-
join_tables = row['Join Tables']
|
| 319 |
-
join_condition = row['Join Condition']
|
| 320 |
-
column_operation = row['Column Operations']
|
| 321 |
-
alias = row['Alias']
|
| 322 |
-
granularity = row['Granularity']
|
| 323 |
-
filter_condition = row['Filter Condition']
|
| 324 |
-
clauses = row['Clauses']
|
| 325 |
-
ordering = row['Ordering']
|
| 326 |
-
|
| 327 |
-
if pd.notna(direct_derived):
|
| 328 |
-
if pd.notna(column_operation):
|
| 329 |
-
if len(column_operation.split()) == 1:
|
| 330 |
-
select_expr = f'{column_operation.upper()}({source_table}.{source_column})'
|
| 331 |
-
else:
|
| 332 |
-
select_expr = column_operation
|
| 333 |
-
else:
|
| 334 |
-
if pd.notna(source_table):
|
| 335 |
-
select_expr = f'{source_table}.{source_column}'
|
| 336 |
-
else:
|
| 337 |
-
select_expr = source_column
|
| 338 |
-
|
| 339 |
-
if source_column not in join_fields:
|
| 340 |
-
if pd.notna(alias):
|
| 341 |
-
select_expr += f' AS {alias}'
|
| 342 |
-
else:
|
| 343 |
-
if pd.notna(column_operation) and pd.notna(source_column):
|
| 344 |
-
select_expr += f' AS {source_column}'
|
| 345 |
-
|
| 346 |
-
if direct_derived.upper() == 'DIRECT':
|
| 347 |
-
select_clause.append(select_expr)
|
| 348 |
-
elif direct_derived.upper() == 'DERIVED_BASE':
|
| 349 |
-
select_clause.append(select_expr)
|
| 350 |
-
|
| 351 |
-
if pd.notna(filter_condition):
|
| 352 |
-
where_clause.append(filter_condition)
|
| 353 |
-
|
| 354 |
-
select_query = ',\n\t'.join(select_clause)
|
| 355 |
-
sql_query = f"CREATE TABLE {temp_table_schema}.{base_table}_BASE\nAS \n\tSELECT \n\t{select_query} \nFROM\n\t{incr_table_dict[base_table]} {base_table}"
|
| 356 |
-
if where_clause:
|
| 357 |
-
sql_query += f"\nWHERE {' AND'.join(where_clause)}"
|
| 358 |
-
sql_query += ';'
|
| 359 |
-
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{base_table}_BASE;")
|
| 360 |
-
proc_query.append(sql_query)
|
| 361 |
-
|
| 362 |
-
df['Clauses'].fillna('', inplace = True)
|
| 363 |
-
df['Ordering'].fillna('', inplace = True)
|
| 364 |
-
c = 1
|
| 365 |
-
temp_base_table = f'{base_table}_BASE'
|
| 366 |
-
grp_cols = ['Join Condition', 'Clauses', 'Ordering']
|
| 367 |
-
join_grps = df[df['Direct/Derived'] == 'DERIVED'].groupby(['Join Condition', 'Clauses', 'Ordering'])
|
| 368 |
-
temp_tables_sql = []
|
| 369 |
-
for (join_condition,clauses,ordering), group in join_grps:
|
| 370 |
-
if pd.notna(group['Direct/Derived'].iloc[0]):
|
| 371 |
-
if group['Direct/Derived'].iloc[0].upper() == 'DERIVED':
|
| 372 |
-
join_tables = [t.strip() for t in group['Join Tables'].iloc[0].split(',')]
|
| 373 |
-
join_keys = [t.strip() for t in group['Joining Keys'].iloc[0].split(',')]
|
| 374 |
-
join_type = [t.strip() for t in group['Join Type'].iloc[0].split(',')]
|
| 375 |
-
join_condition = split_join_condition(group['Join Condition'].iloc[0])
|
| 376 |
-
temp_table_name = f"TEMP_{group['Source Table'].iloc[0]}"
|
| 377 |
-
source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')]
|
| 378 |
-
alias = [t.strip() for t in (','.join(group['Alias'])).split(',')]
|
| 379 |
-
source_table = [t.strip() for t in (','.join(group['Source Table'])).split(',')]
|
| 380 |
-
|
| 381 |
-
base_cols = []
|
| 382 |
-
for join_key in join_keys:
|
| 383 |
-
base_cols.append(f'{join_tables[0]}.{join_key}')
|
| 384 |
-
|
| 385 |
-
for s_table,col,alias in zip(source_table,source_column,alias):
|
| 386 |
-
if pd.notna(group['Column Operations'].iloc[0]):
|
| 387 |
-
if len(group['Column Operations'].iloc[0].split()) == 1:
|
| 388 |
-
select_expr = f"{group['Column Operations'].iloc[0].upper()}({s_table}.{col})"
|
| 389 |
-
else:
|
| 390 |
-
select_expr = group['Column Operations'].iloc[0]
|
| 391 |
-
else:
|
| 392 |
-
if pd.notna(s_table):
|
| 393 |
-
select_expr = f"{s_table}.{col}"
|
| 394 |
-
else:
|
| 395 |
-
select_expr = col
|
| 396 |
-
|
| 397 |
-
if alias:
|
| 398 |
-
select_expr += f" AS {alias}"
|
| 399 |
-
base_cols.append(select_expr)
|
| 400 |
-
|
| 401 |
-
if ordering:
|
| 402 |
-
base_cols.append(f"{ordering} AS RN")
|
| 403 |
-
|
| 404 |
-
sql = ',\n\t\t'.join(base_cols)
|
| 405 |
-
|
| 406 |
-
join_sql = f"SELECT \n\t\t{sql} \nFROM\n\t{incr_table_dict[base_table]} {join_tables[0]}"
|
| 407 |
-
for i in range(len(join_type)):
|
| 408 |
-
join_sql += f'\n\t{join_type[i]} JOIN {incr_table_dict[join_tables[i+1]]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 409 |
-
if clauses:
|
| 410 |
-
join_sql += f'\n\t{clauses}'
|
| 411 |
-
join_sql += ';'
|
| 412 |
-
|
| 413 |
-
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{temp_table_name};")
|
| 414 |
-
proc_query.append(f"CREATE TABLE {temp_table_schema}.{temp_table_name}\nAS \n\t{join_sql}")
|
| 415 |
-
|
| 416 |
-
granularity = [t.strip() for t in group['Granularity'].iloc[0].split(',')]
|
| 417 |
-
|
| 418 |
-
sql = []
|
| 419 |
-
for key in join_keys:
|
| 420 |
-
sql.append(f"A.{key} = B.{key}")
|
| 421 |
-
|
| 422 |
-
temp_cols = []
|
| 423 |
-
temp_cols.append('A.*')
|
| 424 |
-
|
| 425 |
-
source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')]
|
| 426 |
-
alias = [t.strip() for t in (','.join(group['Alias'])).split(',')]
|
| 427 |
-
|
| 428 |
-
for col,alias in zip(source_column,alias):
|
| 429 |
-
select_expr = f"B.{col}"
|
| 430 |
-
if alias:
|
| 431 |
-
select_expr = f"B.{alias}"
|
| 432 |
-
else:
|
| 433 |
-
select_expr = f"B.{col}"
|
| 434 |
-
temp_cols.append(select_expr)
|
| 435 |
-
|
| 436 |
-
temp_select_query = ',\n\t\t'.join(temp_cols)
|
| 437 |
-
|
| 438 |
-
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.TEMP_{temp_table}_{c};")
|
| 439 |
-
|
| 440 |
-
base_sql = f"CREATE TABLE {temp_table_schema}.TEMP_{temp_table}_{c}\nAS \n\tSELECT \n\t\t{temp_select_query} \nFROM\n\t{temp_table_schema}.{temp_base_table} AS A"
|
| 441 |
-
base_sql += f"\n\tLEFT OUTER JOIN {temp_table_schema}.{temp_table_name} B ON {' AND '.join(sql)}"
|
| 442 |
-
|
| 443 |
-
if '1:1' in granularity and len(ordering) > 1:
|
| 444 |
-
base_sql += f" AND B.RN = 1"
|
| 445 |
-
base_sql += ';'
|
| 446 |
-
|
| 447 |
-
temp_base_table = f'TEMP_{temp_table}_{c}'
|
| 448 |
-
c += 1
|
| 449 |
-
proc_query.append(base_sql)
|
| 450 |
-
|
| 451 |
-
fin_table_name = temp_table
|
| 452 |
-
fin_table_cols = []
|
| 453 |
-
|
| 454 |
-
for _,row in df.iterrows():
|
| 455 |
-
field_name = row['Field_Name']
|
| 456 |
-
source_table = row['Source Table']
|
| 457 |
-
source_column = row['Source Column']
|
| 458 |
-
alias = row['Alias']
|
| 459 |
-
|
| 460 |
-
if pd.notna(row['Direct/Derived']):
|
| 461 |
-
if (source_column in join_fields):
|
| 462 |
-
fin_table_cols.append(f'{source_column} AS "{field_name}"')
|
| 463 |
-
else:
|
| 464 |
-
fin_table_cols.append(f'"{field_name}"')
|
| 465 |
-
|
| 466 |
-
fin_table_cols = ',\n\t\t'.join(fin_table_cols)
|
| 467 |
-
fin_sql = f"INSERT INTO {temp_table_schema}.{fin_table_name}\n\tSELECT \n\t\t{fin_table_cols} \nFROM\n\t{temp_table_schema}.TEMP_{temp_table}_{c-1};"
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
condition_col = '_'.join(incr_cols.replace(f'{base_table}.', '').split(','))
|
| 471 |
-
proc_query.append(f"DELETE FROM {temp_table_schema}.{fin_table_name}\nWHERE {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} IN (SELECT {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} FROM {temp_table_schema}.INCR1_{temp_table});")
|
| 472 |
-
proc_query.append(fin_sql)
|
| 473 |
-
|
| 474 |
-
for table in incr_tables:
|
| 475 |
sql = f"""
|
| 476 |
-
|
| 477 |
-
|
| 478 |
SELECT
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
FROM {temp_table_schema}.{temp_table}_INCR where table_name = '{table}'
|
| 486 |
-
);"""
|
| 487 |
proc_query.append(sql)
|
| 488 |
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
def create_df(query, table_df_mapping, table_usage_count):
|
| 492 |
-
script = []
|
| 493 |
-
query = ' '.join(query.split()).strip()
|
| 494 |
-
match = re.match(r'CREATE TABLE (\w+\.\w+\.\w+) AS (SELECT .+)', query, re.IGNORECASE)
|
| 495 |
-
source_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE)
|
| 496 |
-
source_tables = [table for pair in source_tables for table in pair if table]
|
| 497 |
-
|
| 498 |
-
if not match:
|
| 499 |
-
raise ValueError('Invalid SQL')
|
| 500 |
-
table_name = match.group(1).split('.')[2]
|
| 501 |
-
select_statement = match.group(2)
|
| 502 |
-
create_script = f'{table_name} = spark.sql(""" {select_statement} """)'
|
| 503 |
-
persist_script = f'{table_name} = {table_name}.persist()'
|
| 504 |
-
view_script = f'{table_name}.createOrReplaceTempView("{table_name}")'
|
| 505 |
-
|
| 506 |
-
for table in source_tables:
|
| 507 |
-
create_script = create_script.replace(table, table_df_mapping[table])
|
| 508 |
-
|
| 509 |
-
script.append(f"\n\t\t######################---------Creating table {create_script.split('=')[0].strip()}-------############################")
|
| 510 |
-
script.append(create_script)
|
| 511 |
-
script.append(persist_script)
|
| 512 |
-
script.append(view_script)
|
| 513 |
-
script.append(f'''print("{create_script.split('=')[0].strip()} count: ", {create_script.split('=')[0].strip()}.count()''')
|
| 514 |
-
|
| 515 |
-
if 'INCR2_' in table_name:
|
| 516 |
-
x = table_name.split('INCR2_')[1]
|
| 517 |
-
if x in table_details_mapping.keys():
|
| 518 |
-
script.append(f"\n\t\t######################---------Updating the max_update_date in audit-------############################")
|
| 519 |
-
script.append(f"{x}_max_update_date = INCR2_{x}.agg({{'_hoodie_commit_time' : 'max'}}).first()[0]")
|
| 520 |
-
script.append(f"{x}_max_source_reference_date = INCR2_{x}.agg(max(to_timestamp('{table_details_mapping[x][1].replace(x+'.','')}','yyyy-MM-dd-HH.mm.ss.SSSSSS'))).first()[0]")
|
| 521 |
-
script.append(f"insert_max_update_date(spark,redshift_conn, config['application_name'],'{x}',{x}_max_update_date,{x}_max_source_reference_date, max_batch_id, config)")
|
| 522 |
-
script.append('\n')
|
| 523 |
-
|
| 524 |
-
for table in source_tables:
|
| 525 |
-
table_usage_count[table.split('.')[2]] -= 1
|
| 526 |
|
| 527 |
-
|
| 528 |
-
if
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
for _, group in incr_join_grps:
|
| 570 |
-
for table in _.split():
|
| 571 |
-
if base_table != table:
|
| 572 |
-
join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')]
|
| 573 |
-
join_keys = [t.strip() for t in ','.join(base_pk).split(',')]
|
| 574 |
-
join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')]
|
| 575 |
-
join_cond = split_join_condition(incr_table_join_info[table])
|
| 576 |
-
join_condition = join_incr(join_cond)
|
| 577 |
-
source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
|
| 578 |
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
-
|
|
|
|
| 652 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
#
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
st.subheader('AUTOMATED SOURCE TO TARGET MAPPING')
|
| 684 |
mode= st.selectbox('Select Mode of Mapping',('Supervised Mapping(You Have Sufficient Sample Data in Target Template)', 'Unsupervised Mapping(You Do Not Have Sufficient Sample Data in Target Template)'), index=None,placeholder='Select category of table')
|
| 685 |
if mode == 'Supervised Mapping(You Have Sufficient Sample Data in Target Template)':
|
|
|
|
| 54 |
}
|
| 55 |
</style>
|
| 56 |
""", unsafe_allow_html=True)
|
| 57 |
+
|
| 58 |
+
def read_excel(path, sheet):
|
| 59 |
+
df = pd.read_excel(path, sheet_name = sheet, dtype = 'str')
|
| 60 |
+
return df
|
| 61 |
+
|
| 62 |
+
def split_join_condition(join_condition):
|
| 63 |
+
conditions = []
|
| 64 |
+
condition = ''
|
| 65 |
+
bracket_count = 0
|
| 66 |
+
|
| 67 |
+
for char in join_condition:
|
| 68 |
+
if char == '(':
|
| 69 |
+
bracket_count += 1
|
| 70 |
+
elif char == ')':
|
| 71 |
+
bracket_count -+ 1
|
| 72 |
+
if char == ',' and bracket_count == 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
conditions.append(condition.strip())
|
| 74 |
+
condition = ''
|
| 75 |
+
else:
|
| 76 |
+
condition += char
|
| 77 |
+
if condition:
|
| 78 |
+
conditions.append(condition.strip())
|
| 79 |
+
|
| 80 |
+
return conditions
|
| 81 |
+
|
| 82 |
+
def join_incr(join_conditions):
|
| 83 |
+
join = []
|
| 84 |
+
join_pattern = re.compile(r'(\w+\.\w+)\s*=\s*(\w+\w.\w+)', re.IGNORECASE)
|
| 85 |
+
for join_condition in join_conditions:
|
| 86 |
+
parts = re.split(r'\sAND\s|\sOR\s', join_condition, flags = re.IGNORECASE)
|
| 87 |
+
temp = [x.strip() for x in parts if join_pattern.match(x.strip())]
|
| 88 |
+
join.append(' AND '.join(temp))
|
| 89 |
+
return join
|
| 90 |
+
|
| 91 |
+
def generate_sql(temp_table):
|
| 92 |
+
proc_query = []
|
| 93 |
+
base_table = None
|
| 94 |
|
| 95 |
+
source_table_schema = 'MAIN.GOLD'
|
| 96 |
+
temp_table_schema = 'MAIN.GOLD'
|
| 97 |
+
base_pk = []
|
| 98 |
|
| 99 |
+
join_fields = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
for _,row in df.iterrows():
|
| 102 |
+
source_table = row['Source Table']
|
| 103 |
+
primary_key = row['Primary Key']
|
| 104 |
+
source_column = row['Source Column']
|
| 105 |
+
alias = row['Alias']
|
| 106 |
+
joining_keys = row['Joining Keys']
|
| 107 |
+
|
| 108 |
+
if not base_table:
|
| 109 |
+
if primary_key == 'Y':
|
| 110 |
+
base_table = source_table
|
| 111 |
+
base_pk.append(joining_keys)
|
| 112 |
+
|
| 113 |
+
if pd.notna(joining_keys):
|
| 114 |
+
keys = [x.strip() for x in joining_keys.split(',')]
|
| 115 |
+
for x in keys:
|
| 116 |
+
if x not in join_fields:
|
| 117 |
+
join_fields.add(x)
|
| 118 |
+
|
| 119 |
+
unique_cols = ['Source Table', 'Joining Keys', 'Primary Key', 'Join Type','Join Tables','Join Condition']
|
| 120 |
+
unique_df = df.drop_duplicates(subset = unique_cols)
|
| 121 |
+
|
| 122 |
+
incremantal_mapping = {}
|
| 123 |
+
incr_joins = {}
|
| 124 |
+
|
| 125 |
+
for _,row in unique_df.iterrows():
|
| 126 |
+
|
| 127 |
+
source_table = row['Source Table']
|
| 128 |
+
source_column = row['Source Column']
|
| 129 |
+
joining_keys = row['Joining Keys']
|
| 130 |
+
primary_key = row['Primary Key']
|
| 131 |
+
direct_derived = row['Direct/Derived']
|
| 132 |
+
join_type = row['Join Type']
|
| 133 |
+
join_tables = row['Join Tables']
|
| 134 |
+
join_condition = row['Join Condition']
|
| 135 |
+
|
| 136 |
+
if source_table == base_table:
|
| 137 |
+
if primary_key == 'Y':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
key = (source_table, joining_keys, join_type, join_tables, join_condition)
|
| 139 |
key1 = source_table
|
| 140 |
+
else:
|
| 141 |
+
continue
|
| 142 |
+
else:
|
| 143 |
+
key = (source_table, joining_keys, join_type, join_tables, join_condition)
|
| 144 |
+
key1 = source_table
|
| 145 |
+
if pd.notna(direct_derived) and pd.notna(source_table) and pd.notna(source_column):
|
| 146 |
+
if key not in incremantal_mapping:
|
| 147 |
+
incremantal_mapping[key] = {
|
| 148 |
+
'source_table': source_table,
|
| 149 |
+
'joining_keys':joining_keys,
|
| 150 |
+
'join_type': join_type,
|
| 151 |
+
'join_tables': join_tables,
|
| 152 |
+
'join_condition': join_condition
|
| 153 |
+
}
|
| 154 |
+
if key1 not in incr_joins:
|
| 155 |
+
if pd.notna(direct_derived) and direct_derived == 'DERIVED':
|
| 156 |
+
incr_joins[key1] = {
|
| 157 |
'join_type': join_type,
|
| 158 |
+
'join_tables': ', '.join([x.strip() for x in join_tables.split(',') if x != base_table]),
|
| 159 |
'join_condition': join_condition
|
| 160 |
}
|
| 161 |
+
incremental_df = pd.DataFrame(incremantal_mapping.values())
|
| 162 |
+
incr_join_grps = incremental_df.groupby(['source_table'])
|
| 163 |
+
proc_query.append(f'TRUNCATE TABLE {temp_table_schema}.{temp_table}_INCR;')
|
| 164 |
+
|
| 165 |
+
incr_table_join_info = {}
|
| 166 |
+
for _,row in incremental_df.iterrows():
|
| 167 |
+
source_table = row['source_table']
|
| 168 |
+
|
| 169 |
+
if source_table != base_table:
|
| 170 |
+
joining_keys = row['joining_keys']
|
| 171 |
+
join_type = row['join_type']
|
| 172 |
+
join_tables = [x.strip() for x in row['join_tables'].split(',')]
|
| 173 |
+
index = join_tables.index(source_table)
|
| 174 |
+
join_condition = [x.strip() for x in row['join_condition'].split(',')][0:index]
|
| 175 |
+
incr_table_join_info[source_table] = ', '.join(join_condition)
|
| 176 |
+
|
| 177 |
+
incr_query = []
|
| 178 |
+
incr_cols = ''
|
| 179 |
+
incr_tables = []
|
| 180 |
+
incr_join = {}
|
| 181 |
+
|
| 182 |
+
for _, group in incr_join_grps:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
+
for table in _.split():
|
| 185 |
+
if base_table != table:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
+
join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')]
|
| 188 |
+
join_keys = [t.strip() for t in ','.join(base_pk).split(',')]
|
| 189 |
+
join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')]
|
| 190 |
+
join_cond = split_join_condition(incr_table_join_info[table])
|
| 191 |
+
join_condition = join_incr(join_cond)
|
| 192 |
+
source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
|
| 193 |
+
|
| 194 |
+
join_key_list = []
|
| 195 |
+
for x in join_keys:
|
| 196 |
+
join_key_list.append(f'{base_table}.{x}')
|
| 197 |
+
join_key = ', '.join(join_key_list)
|
| 198 |
+
|
| 199 |
+
for y in source_table:
|
| 200 |
sql = f"""
|
| 201 |
+
INSERT INTO {temp_table_schema}.{temp_table}_INCR
|
| 202 |
+
(
|
| 203 |
+
SELECT {join_key}, {table_details_mapping[y][0]}, {table_details_mapping[y][1]}, '{y}', 1, CURRENT_TIMESTAMP
|
| 204 |
+
FROM {source_table_schema}.{base_table} {base_table}"""
|
| 205 |
+
|
| 206 |
+
incr_join_text = ''
|
| 207 |
+
for i in range(len(join_condition)):
|
| 208 |
+
sql += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 209 |
+
incr_join_text += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 210 |
+
incr_join[y] = incr_join_text
|
| 211 |
+
|
| 212 |
+
sql += f"""
|
| 213 |
+
WHERE COALESCE({join_tables[i+1]}.operation,'NA') <> 'D'
|
| 214 |
+
AND TO_TIMESTAMP( CAST(SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),1,4) || '-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),5,2) ||'-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),7,2) || ' ' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),9,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),11,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS') > (SELECT MAX(max_update_date) FROM audit.reportingdb_audit_tbl_{temp_table} WHERE mart_table_name='{temp_table}' and src_table_name='{y}')
|
| 215 |
+
);"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
+
incr_query.append(sql)
|
| 218 |
+
incr_tables.append(y)
|
| 219 |
+
|
| 220 |
+
else:
|
| 221 |
+
source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
|
| 222 |
+
join_keys = [t.strip() for t in group['joining_keys'].iloc[0].split(',')]
|
| 223 |
+
|
| 224 |
+
join_key_list = []
|
| 225 |
+
for x in join_keys:
|
| 226 |
+
join_key_list.append(f'{base_table}.{x}')
|
| 227 |
+
join_key = ', '.join(join_key_list)
|
| 228 |
|
| 229 |
+
incr_cols = join_key
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
sql = f"""
|
| 231 |
+
INSERT INTO {temp_table_schema}.{temp_table}_INCR
|
| 232 |
+
(
|
| 233 |
+
SELECT {join_key}, {table_details_mapping[base_table][0]}, {table_details_mapping[base_table][1]}, '{base_table}', 1, CURRENT_TIMESTAMP
|
| 234 |
+
FROM {source_table_schema}.{base_table} {base_table}
|
| 235 |
+
WHERE COALESCE(operation,'NA') <> 'D'
|
| 236 |
+
AND TO_TIMESTAMP( CAST(SUBSTRING((_hoodie_commit_time),1,4) || '-' || SUBSTRING((_hoodie_commit_time),5,2) ||'-' || SUBSTRING((_hoodie_commit_time),7,2) || ' ' || SUBSTRING((_hoodie_commit_time),9,2) ||':' || SUBSTRING((_hoodie_commit_time),11,2) ||':' || SUBSTRING((_hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS') > (SELECT MAX(max_update_date) FROM audit.reportingdb_audit_tbl_{temp_table} WHERE mart_table_name='{temp_table}' and src_table_name='{base_table}')
|
| 237 |
+
);"""
|
|
|
|
|
|
|
| 238 |
proc_query.append(sql)
|
| 239 |
+
incr_tables.append(base_table)
|
| 240 |
+
|
| 241 |
+
proc_query.append('\n'.join(incr_query))
|
| 242 |
+
proc_query.append(f'TRUNCATE TABLE {temp_table_schema}.INCR1_{temp_table};')
|
| 243 |
+
|
| 244 |
+
sql = f"""
|
| 245 |
+
INSERT INTO {temp_table_schema}.INCR1_{temp_table}
|
| 246 |
+
(
|
| 247 |
+
SELECT DISTINCT {incr_cols.replace(f'{base_table}.', '')}
|
| 248 |
+
FROM {temp_table_schema}.{temp_table}_INCR
|
| 249 |
+
);"""
|
| 250 |
+
|
| 251 |
+
proc_query.append(sql)
|
| 252 |
+
|
| 253 |
+
incr_table_dict = {}
|
| 254 |
+
for table in incr_tables:
|
| 255 |
+
if table == base_table:
|
| 256 |
+
incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}'
|
| 257 |
+
else:
|
| 258 |
+
p = [x for x in incr_join[table].split('\n\t') if len(x) > 1]
|
| 259 |
+
if len(p) == 1:
|
| 260 |
+
incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}'
|
| 261 |
else:
|
| 262 |
+
incr_table_dict[table] = f'{source_table_schema}.{table}'
|
| 263 |
+
|
| 264 |
+
s = []
|
| 265 |
+
for table in incr_tables:
|
| 266 |
+
incr2_sql_list = []
|
| 267 |
+
|
| 268 |
+
if table == base_table:
|
| 269 |
+
for key in incr_cols.replace(f'{base_table}.', '').split(','):
|
| 270 |
+
incr2_sql_list.append(f"{base_table}.{key} = A.{key}")
|
| 271 |
+
incr2_sql_join = ' AND '.join(incr2_sql_list)
|
| 272 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
sql = f"""
|
| 274 |
+
CREATE TABLE {temp_table_schema}.INCR2_{table}
|
| 275 |
+
AS
|
| 276 |
SELECT
|
| 277 |
+
{table}.*
|
| 278 |
+
FROM
|
| 279 |
+
{source_table_schema}.{table} {table}
|
| 280 |
+
INNER JOIN
|
| 281 |
+
{temp_table_schema}.INCR1_{temp_table} A ON {incr2_sql_join}; """
|
| 282 |
+
proc_query.append(f'DROP TABLE IF EXISTS {temp_table_schema}.INCR2_{table};')
|
|
|
|
|
|
|
| 283 |
proc_query.append(sql)
|
| 284 |
|
| 285 |
+
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
+
p = [x for x in incr_join[table].split('\n\t') if len(x) > 1]
|
| 288 |
+
if len(p) == 1:
|
| 289 |
+
sql = f"""
|
| 290 |
+
CREATE TABLE {temp_table_schema}.INCR2_{table}
|
| 291 |
+
AS
|
| 292 |
+
SELECT
|
| 293 |
+
{table}.*
|
| 294 |
+
FROM
|
| 295 |
+
{temp_table_schema}.INCR2_{base_table} {base_table} {incr_join[table]};"""
|
| 296 |
+
s.append(f'DROP TABLE IF EXISTS {temp_table_schema}.INCR2_{table};')
|
| 297 |
+
s.append(sql)
|
| 298 |
|
| 299 |
+
for x in s:
|
| 300 |
+
proc_query.append(x)
|
| 301 |
+
|
| 302 |
+
select_clause = []
|
| 303 |
+
from_clause = []
|
| 304 |
+
where_clause = []
|
| 305 |
+
|
| 306 |
+
for _,row in df.iterrows():
|
| 307 |
+
field_name = row['Field_Name']
|
| 308 |
+
source_table = row['Source Table']
|
| 309 |
+
source_column = row['Source Column']
|
| 310 |
+
joining_keys = row['Joining Keys']
|
| 311 |
+
primary_key = row['Primary Key']
|
| 312 |
+
direct_derived = row['Direct/Derived']
|
| 313 |
+
join_type = row['Join Type']
|
| 314 |
+
join_tables = row['Join Tables']
|
| 315 |
+
join_condition = row['Join Condition']
|
| 316 |
+
column_operation = row['Column Operations']
|
| 317 |
+
alias = row['Alias']
|
| 318 |
+
granularity = row['Granularity']
|
| 319 |
+
filter_condition = row['Filter Condition']
|
| 320 |
+
clauses = row['Clauses']
|
| 321 |
+
ordering = row['Ordering']
|
| 322 |
+
|
| 323 |
+
if pd.notna(direct_derived):
|
| 324 |
+
if pd.notna(column_operation):
|
| 325 |
+
if len(column_operation.split()) == 1:
|
| 326 |
+
select_expr = f'{column_operation.upper()}({source_table}.{source_column})'
|
| 327 |
+
else:
|
| 328 |
+
select_expr = column_operation
|
| 329 |
+
else:
|
| 330 |
+
if pd.notna(source_table):
|
| 331 |
+
select_expr = f'{source_table}.{source_column}'
|
| 332 |
+
else:
|
| 333 |
+
select_expr = source_column
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
+
if source_column not in join_fields:
|
| 336 |
+
if pd.notna(alias):
|
| 337 |
+
select_expr += f' AS {alias}'
|
| 338 |
+
else:
|
| 339 |
+
if pd.notna(column_operation) and pd.notna(source_column):
|
| 340 |
+
select_expr += f' AS {source_column}'
|
| 341 |
+
|
| 342 |
+
if direct_derived.upper() == 'DIRECT':
|
| 343 |
+
select_clause.append(select_expr)
|
| 344 |
+
elif direct_derived.upper() == 'DERIVED_BASE':
|
| 345 |
+
select_clause.append(select_expr)
|
| 346 |
|
| 347 |
+
if pd.notna(filter_condition):
|
| 348 |
+
where_clause.append(filter_condition)
|
| 349 |
+
|
| 350 |
+
select_query = ',\n\t'.join(select_clause)
|
| 351 |
+
sql_query = f"CREATE TABLE {temp_table_schema}.{base_table}_BASE\nAS \n\tSELECT \n\t{select_query} \nFROM\n\t{incr_table_dict[base_table]} {base_table}"
|
| 352 |
+
if where_clause:
|
| 353 |
+
sql_query += f"\nWHERE {' AND'.join(where_clause)}"
|
| 354 |
+
sql_query += ';'
|
| 355 |
+
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{base_table}_BASE;")
|
| 356 |
+
proc_query.append(sql_query)
|
| 357 |
+
|
| 358 |
+
df['Clauses'].fillna('', inplace = True)
|
| 359 |
+
df['Ordering'].fillna('', inplace = True)
|
| 360 |
+
c = 1
|
| 361 |
+
temp_base_table = f'{base_table}_BASE'
|
| 362 |
+
grp_cols = ['Join Condition', 'Clauses', 'Ordering']
|
| 363 |
+
join_grps = df[df['Direct/Derived'] == 'DERIVED'].groupby(['Join Condition', 'Clauses', 'Ordering'])
|
| 364 |
+
temp_tables_sql = []
|
| 365 |
+
for (join_condition,clauses,ordering), group in join_grps:
|
| 366 |
+
if pd.notna(group['Direct/Derived'].iloc[0]):
|
| 367 |
+
if group['Direct/Derived'].iloc[0].upper() == 'DERIVED':
|
| 368 |
+
join_tables = [t.strip() for t in group['Join Tables'].iloc[0].split(',')]
|
| 369 |
+
join_keys = [t.strip() for t in group['Joining Keys'].iloc[0].split(',')]
|
| 370 |
+
join_type = [t.strip() for t in group['Join Type'].iloc[0].split(',')]
|
| 371 |
+
join_condition = split_join_condition(group['Join Condition'].iloc[0])
|
| 372 |
+
temp_table_name = f"TEMP_{group['Source Table'].iloc[0]}"
|
| 373 |
+
source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')]
|
| 374 |
+
alias = [t.strip() for t in (','.join(group['Alias'])).split(',')]
|
| 375 |
+
source_table = [t.strip() for t in (','.join(group['Source Table'])).split(',')]
|
| 376 |
+
|
| 377 |
+
base_cols = []
|
| 378 |
+
for join_key in join_keys:
|
| 379 |
+
base_cols.append(f'{join_tables[0]}.{join_key}')
|
| 380 |
+
|
| 381 |
+
for s_table,col,alias in zip(source_table,source_column,alias):
|
| 382 |
+
if pd.notna(group['Column Operations'].iloc[0]):
|
| 383 |
+
if len(group['Column Operations'].iloc[0].split()) == 1:
|
| 384 |
+
select_expr = f"{group['Column Operations'].iloc[0].upper()}({s_table}.{col})"
|
| 385 |
+
else:
|
| 386 |
+
select_expr = group['Column Operations'].iloc[0]
|
| 387 |
+
else:
|
| 388 |
+
if pd.notna(s_table):
|
| 389 |
+
select_expr = f"{s_table}.{col}"
|
| 390 |
+
else:
|
| 391 |
+
select_expr = col
|
| 392 |
|
| 393 |
+
if alias:
|
| 394 |
+
select_expr += f" AS {alias}"
|
| 395 |
+
base_cols.append(select_expr)
|
| 396 |
+
|
| 397 |
+
if ordering:
|
| 398 |
+
base_cols.append(f"{ordering} AS RN")
|
| 399 |
+
|
| 400 |
+
sql = ',\n\t\t'.join(base_cols)
|
| 401 |
+
|
| 402 |
+
join_sql = f"SELECT \n\t\t{sql} \nFROM\n\t{incr_table_dict[base_table]} {join_tables[0]}"
|
| 403 |
+
for i in range(len(join_type)):
|
| 404 |
+
join_sql += f'\n\t{join_type[i]} JOIN {incr_table_dict[join_tables[i+1]]} {join_tables[i+1]} ON {join_condition[i]}'
|
| 405 |
+
if clauses:
|
| 406 |
+
join_sql += f'\n\t{clauses}'
|
| 407 |
+
join_sql += ';'
|
| 408 |
+
|
| 409 |
+
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{temp_table_name};")
|
| 410 |
+
proc_query.append(f"CREATE TABLE {temp_table_schema}.{temp_table_name}\nAS \n\t{join_sql}")
|
| 411 |
+
|
| 412 |
+
granularity = [t.strip() for t in group['Granularity'].iloc[0].split(',')]
|
| 413 |
+
|
| 414 |
+
sql = []
|
| 415 |
+
for key in join_keys:
|
| 416 |
+
sql.append(f"A.{key} = B.{key}")
|
| 417 |
+
|
| 418 |
+
temp_cols = []
|
| 419 |
+
temp_cols.append('A.*')
|
| 420 |
+
|
| 421 |
+
source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')]
|
| 422 |
+
alias = [t.strip() for t in (','.join(group['Alias'])).split(',')]
|
| 423 |
+
|
| 424 |
+
for col,alias in zip(source_column,alias):
|
| 425 |
+
select_expr = f"B.{col}"
|
| 426 |
+
if alias:
|
| 427 |
+
select_expr = f"B.{alias}"
|
| 428 |
+
else:
|
| 429 |
+
select_expr = f"B.{col}"
|
| 430 |
+
temp_cols.append(select_expr)
|
| 431 |
+
|
| 432 |
+
temp_select_query = ',\n\t\t'.join(temp_cols)
|
| 433 |
+
|
| 434 |
+
proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.TEMP_{temp_table}_{c};")
|
| 435 |
+
|
| 436 |
+
base_sql = f"CREATE TABLE {temp_table_schema}.TEMP_{temp_table}_{c}\nAS \n\tSELECT \n\t\t{temp_select_query} \nFROM\n\t{temp_table_schema}.{temp_base_table} AS A"
|
| 437 |
+
base_sql += f"\n\tLEFT OUTER JOIN {temp_table_schema}.{temp_table_name} B ON {' AND '.join(sql)}"
|
| 438 |
+
|
| 439 |
+
if '1:1' in granularity and len(ordering) > 1:
|
| 440 |
+
base_sql += f" AND B.RN = 1"
|
| 441 |
+
base_sql += ';'
|
| 442 |
+
|
| 443 |
+
temp_base_table = f'TEMP_{temp_table}_{c}'
|
| 444 |
+
c += 1
|
| 445 |
+
proc_query.append(base_sql)
|
| 446 |
|
| 447 |
+
fin_table_name = temp_table
|
| 448 |
+
fin_table_cols = []
|
| 449 |
|
| 450 |
+
for _,row in df.iterrows():
|
| 451 |
+
field_name = row['Field_Name']
|
| 452 |
+
source_table = row['Source Table']
|
| 453 |
+
source_column = row['Source Column']
|
| 454 |
+
alias = row['Alias']
|
| 455 |
|
| 456 |
+
if pd.notna(row['Direct/Derived']):
|
| 457 |
+
if (source_column in join_fields):
|
| 458 |
+
fin_table_cols.append(f'{source_column} AS "{field_name}"')
|
| 459 |
+
else:
|
| 460 |
+
fin_table_cols.append(f'"{field_name}"')
|
| 461 |
+
|
| 462 |
+
fin_table_cols = ',\n\t\t'.join(fin_table_cols)
|
| 463 |
+
fin_sql = f"INSERT INTO {temp_table_schema}.{fin_table_name}\n\tSELECT \n\t\t{fin_table_cols} \nFROM\n\t{temp_table_schema}.TEMP_{temp_table}_{c-1};"
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
condition_col = '_'.join(incr_cols.replace(f'{base_table}.', '').split(','))
|
| 467 |
+
proc_query.append(f"DELETE FROM {temp_table_schema}.{fin_table_name}\nWHERE {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} IN (SELECT {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} FROM {temp_table_schema}.INCR1_{temp_table});")
|
| 468 |
+
proc_query.append(fin_sql)
|
| 469 |
|
| 470 |
+
for table in incr_tables:
|
| 471 |
+
sql = f"""
|
| 472 |
+
INSERT INTO audit.reportingdb_audit_tbl_{temp_table}
|
| 473 |
+
(
|
| 474 |
+
SELECT
|
| 475 |
+
'{temp_table}' as mart_table_name,
|
| 476 |
+
'{table}' as src_table_name,
|
| 477 |
+
coalesce( max(TO_TIMESTAMP( CAST(SUBSTRING((_hoodie_commit_time),1,4) || '-' || SUBSTRING((_hoodie_commit_time),5,2) ||'-' || SUBSTRING((_hoodie_commit_time),7,2) || ' ' || SUBSTRING((_hoodie_commit_time),9,2) ||':' || SUBSTRING((_hoodie_commit_time),11,2) ||':' || SUBSTRING((_hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS')),(select max(max_update_date) from audit.reportingdb_audit_tbl_{temp_table} where Mart_Table_Name='{temp_table}' and Src_Table_Name= '{table}')) max_update_date,
|
| 478 |
+
CURRENT_TIMESTAMP as load_timestamp,
|
| 479 |
+
coalesce(max(prev_updt_ts),(select max(source_reference_date) from audit.reportingdb_audit_tbl_{temp_table} where Mart_Table_Name='{temp_table}' and Src_Table_Name= '{table}')) AS source_reference_date,
|
| 480 |
+
max(nvl(batch_number,0))+1
|
| 481 |
+
FROM {temp_table_schema}.{temp_table}_INCR where table_name = '{table}'
|
| 482 |
+
);"""
|
| 483 |
+
proc_query.append(sql)
|
| 484 |
|
| 485 |
+
return base_table, base_pk, proc_query, incr_join_grps, incr_table_join_info, incr_join, temp_table_schema
|
| 486 |
+
|
| 487 |
+
def create_df(query, table_df_mapping, table_usage_count):
    """Translate one ``CREATE TABLE ... AS SELECT ...`` statement into PySpark script lines.

    Parses the Redshift-style DDL, rewrites fully-qualified source table names to their
    DataFrame variable names via ``table_df_mapping``, and emits the ``spark.sql`` /
    ``persist`` / ``createOrReplaceTempView`` boilerplate plus an audit update for
    ``INCR2_`` tables. Decrements ``table_usage_count`` for each source table and emits
    an ``unpersist()`` once a table is no longer needed.

    Args:
        query: A single SQL statement of the form
            ``CREATE TABLE <db.schema.table> AS SELECT ...``.
        table_df_mapping: Maps fully-qualified table names (``a.b.c``) to the
            DataFrame variable name used in the generated script.
        table_usage_count: Mutable remaining-use counter per table name; mutated
            in place (decremented for every source table of this query).

    Returns:
        The generated script lines joined with ``'\\n\\t\\t'`` (matches the
        indentation of the surrounding code template).

    Raises:
        ValueError: If ``query`` does not match the expected CREATE TABLE shape.
    """
    script = []
    # Collapse all whitespace so the regexes below see a single-line statement.
    query = ' '.join(query.split()).strip()
    match = re.match(r'CREATE TABLE (\w+\.\w+\.\w+) AS (SELECT .+)', query, re.IGNORECASE)
    # Each findall hit is a (FROM-group, JOIN-group) pair with one side empty; flatten.
    source_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE)
    source_tables = [table for pair in source_tables for table in pair if table]

    if not match:
        raise ValueError('Invalid SQL')
    table_name = match.group(1).split('.')[2]
    select_statement = match.group(2)
    create_script = f'{table_name} = spark.sql(""" {select_statement} """)'
    persist_script = f'{table_name} = {table_name}.persist()'
    view_script = f'{table_name}.createOrReplaceTempView("{table_name}")'

    # Replace fully-qualified names with the registered temp-view / DataFrame names.
    for table in source_tables:
        create_script = create_script.replace(table, table_df_mapping[table])

    script.append(f"\n\t\t######################---------Creating table {create_script.split('=')[0].strip()}-------############################")
    script.append(create_script)
    script.append(persist_script)
    script.append(view_script)
    # BUGFIX: the generated print() was missing its closing parenthesis, which made
    # the emitted Spark script a SyntaxError.
    script.append(f'''print("{create_script.split('=')[0].strip()} count: ", {create_script.split('=')[0].strip()}.count())''')

    if 'INCR2_' in table_name:
        x = table_name.split('INCR2_')[1]
        # Only known source tables carry audit metadata (commit-time / reference-date columns).
        if x in table_details_mapping.keys():
            script.append(f"\n\t\t######################---------Updating the max_update_date in audit-------############################")
            script.append(f"{x}_max_update_date = INCR2_{x}.agg({{'_hoodie_commit_time' : 'max'}}).first()[0]")
            script.append(f"{x}_max_source_reference_date = INCR2_{x}.agg(max(to_timestamp('{table_details_mapping[x][1].replace(x+'.','')}','yyyy-MM-dd-HH.mm.ss.SSSSSS'))).first()[0]")
            script.append(f"insert_max_update_date(spark,redshift_conn, config['application_name'],'{x}',{x}_max_update_date,{x}_max_source_reference_date, max_batch_id, config)")
            script.append('\n')

    # Consume one "use" per source table referenced by this statement.
    for table in source_tables:
        table_usage_count[table.split('.')[2]] -= 1

    # Once a table's remaining-use count hits zero it can be unpersisted
    # (INCR1_ tables are kept alive; they are reused later in the flow).
    for table in source_tables:
        if table_usage_count[table.split('.')[2]] == 0 and 'INCR1_' not in table:
            unpersist_script = f"{table.split('.')[2]}.unpersist()"
            script.append(unpersist_script)

    return '\n\t\t'.join(script)
| 529 |
+
|
| 530 |
+
def generate_spark(proc_query, incr_join_grps, base_table, base_pk, incr_table_join_info, incr_join, temp_table_schema):
    """Assemble the full PySpark job script from the generated Redshift procedure queries.

    Builds: source-read boilerplate, audit max_update_date reads, the INCR1 union
    query, INCR2 audit updates, one ``create_df`` translation per CREATE TABLE
    statement, and finally splices everything into ``template.txt`` at the
    ``INSERT_CODE_1`` / ``INSERT_CODE_2`` placeholders.

    Relies on module-level state: ``temp_table``, ``table_details_mapping``,
    ``create_df``, ``split_join_condition`` and ``join_incr``.

    Args:
        proc_query: Ordered list of SQL statements produced earlier in the flow.
        incr_join_grps: pandas groupby of derived-join groups (key is a
            space-separated table list).
        base_table: Name of the driving source table.
        base_pk: Iterable of primary-key column names of ``base_table``.
        incr_table_join_info: Per-table raw join-condition strings.
        incr_join: Per-table rendered join clauses (newline/tab separated).
        temp_table_schema: Schema holding the temp/working tables.

    Returns:
        The completed script text (template with both code sections inserted).
    """
    table_usage_count = defaultdict(int)
    table_df_mapping = {}

    for query in proc_query:
        # BUGFIX: was `if 'CREATE TABLE' or 'DELETE' in query:` which is always
        # true ('CREATE TABLE' is a truthy literal), so every statement inflated
        # the usage counts and unpersist() was never emitted for some tables.
        if 'CREATE TABLE' in query or 'DELETE' in query:
            source_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE)
            source_tables = [table for pair in source_tables for table in pair if table]
            for table in source_tables:
                table_usage_count[table.split('.')[2]] += 1
                # DELETE statements consume a use but do not define a DataFrame name.
                if 'DELETE' not in query:
                    table_df_mapping[table] = table.split('.')[2]

    script = []
    for query in proc_query:
        if 'CREATE TABLE' in query:
            script.append(create_df(query, table_df_mapping, table_usage_count))

    spark_query = []
    spark_query.append("\t\t######################---------Reading source data -------############################")
    for table in table_details_mapping.keys():
        spark_query.append(f'{table} = read_file(spark, config, \"{table}\").filter("{table_details_mapping[table][2]}")')
        spark_query.append(f'{table} = {table}.persist()')
        spark_query.append(f'{table}.createOrReplaceTempView("{table}")')
        # BUGFIX: generated print() was missing its closing parenthesis.
        spark_query.append(f'print("{table} count: ", {table}.count())')
        spark_query.append('\n')

    spark_query.append("\n\t\t######################---------Reading records-------############################")
    for table in table_details_mapping.keys():
        spark_query.append(f"{table}_max_update_date = read_max_update_date(redshift_conn, config['application_name'],'{table}', config)")
        spark_query.append(f'{table}_max_update_date = {table}_max_update_date[0][0]')
        spark_query.append('\n')

    incr1_spark = []
    temp_incr1 = []
    for _, group in incr_join_grps:
        for table in _.split():
            if base_table != table:
                # Non-base table: change capture requires joining back to the base
                # table through the recorded join path for this group.
                join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')]
                join_keys = [t.strip() for t in ','.join(base_pk).split(',')]
                join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')]
                join_cond = split_join_condition(incr_table_join_info[table])
                join_condition = join_incr(join_cond)
                source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]

                join_key_list = []
                for x in join_keys:
                    join_key_list.append(f'{base_table}.{x}')
                join_key = ', '.join(join_key_list)

                for y in source_table:
                    sql = f"""SELECT {join_key} FROM {base_table} {base_table}"""

                    incr_join_text = ''
                    i = 0
                    for i in range(len(join_condition)):
                        sql += f' {join_type[i]} JOIN {join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'
                        incr_join_text += f' {join_type[i]} JOIN {join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}'

                    # The emitted text embeds a runtime str(..._max_update_date)
                    # concatenation into the generated Spark SQL string.
                    sql += f''' WHERE {join_tables[i+1]}._hoodie_commit_time > cast('"""+str({join_tables[i+1]}_max_update_date)+"""' as timestamp)'''
                    temp_incr1.append(sql)

            else:
                # Base table: filter on its own commit time directly.
                source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')]
                join_keys = [t.strip() for t in group['joining_keys'].iloc[0].split(',')]

                join_key_list = []
                for x in join_keys:
                    join_key_list.append(f'{base_table}.{x}')
                join_key = ', '.join(join_key_list)

                sql = f'''SELECT {join_key} FROM {base_table} {base_table} WHERE {base_table}._hoodie_commit_time > cast('"""+str({base_table}_max_update_date)+"""' as timestamp)'''
                incr1_spark.append(sql)
    for i in temp_incr1:
        incr1_spark.append(i)
    incr1_spark = '\nUNION\n'.join(incr1_spark)
    spark_query.append("\n\t\t######################---------Creating INCR1-------############################")
    spark_query.append(f'INCR1_{temp_table} = spark.sql(""" {incr1_spark} """)')
    spark_query.append(f'\n\t\tINCR1_{temp_table} = INCR1_{temp_table}.dropDuplicates()')
    spark_query.append(f'INCR1_{temp_table} = INCR1_{temp_table}.persist()')
    spark_query.append(f'INCR1_{temp_table}.createOrReplaceTempView("INCR1_{temp_table}")')
    spark_query.append(f'print("INCR1_{temp_table} count: ", INCR1_{temp_table}.count())')

    spark_query.append("\n\t\t######################---------Creating INCR2-------############################")
    for table in table_details_mapping.keys():
        if table in incr_join.keys():
            # Tables whose join text has more than one real line are read straight
            # from the source schema (no INCR2 table), so their audit row is
            # updated here instead of inside create_df.
            p = [x for x in incr_join[table].split('\n\t') if len(x) > 1]
            if len(p) > 1:
                spark_query.append(f"\n\t\t######################---------Updating the max_update_date in audit-------############################")
                spark_query.append(f"{table}_max_update_date = {table}.agg({{'_hoodie_commit_time' : 'max'}}).first()[0]")
                spark_query.append(f"{table}_max_source_reference_date = {table}.agg(max(to_timestamp('{table_details_mapping[table][1].replace(table+'.','')}','yyyy-MM-dd-HH.mm.ss.SSSSSS'))).first()[0]")
                spark_query.append(f"insert_max_update_date(spark,redshift_conn, config['application_name'],'{table}',{table}_max_update_date,{table}_max_source_reference_date, max_batch_id, config)")
                spark_query.append('\n')

    for query in script:
        spark_query.append(query)
        spark_query.append('\n')

    spark_query1 = []
    spark_query1.append('\n')
    for query in proc_query:
        # The statement that loads the final mart table also determines which
        # DataFrame must be written out to parquet for the Redshift load step.
        if f'{temp_table_schema}.{temp_table}\n' in query:
            final_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE)
            final_tables = [table.split('.')[2].strip() for pair in final_tables for table in pair if table and table.split('.')[2].strip() != temp_table][0]
            if 'INCR1_' in final_tables:
                spark_query.append(f"{final_tables}.write.mode('overwrite').parquet(config['incr2df_path'])")
            else:
                spark_query.append(f"{final_tables}.write.mode('overwrite').parquet(config['resultdf_path'])")
        spark_query1.append(f'''cur.execute(""" {query} """)''')
        spark_query1.append('\n')

    with open('template.txt') as file:
        template = file.read()

    result = template.replace('INSERT_CODE_1', '\n\t\t'.join(spark_query))
    result = result.replace('INSERT_CODE_2', '\t\t'.join(spark_query1))

    return result
| 648 |
+
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
# st.set_page_config(page_title='AUTOMATED SOURCE TO TARGET MAPPING', layout= 'wide')
|
| 652 |
+
# st.markdown("""
|
| 653 |
+
# <style>
|
| 654 |
+
|
| 655 |
+
# /* Remove blank space at top and bottom */
|
| 656 |
+
# .block-container {
|
| 657 |
+
# padding-top: 1.9rem;
|
| 658 |
+
# padding-bottom: 1rem;
|
| 659 |
+
# }
|
| 660 |
+
|
| 661 |
+
# /* Remove blank space at the center canvas */
|
| 662 |
+
# .st-emotion-cache-z5fcl4 {
|
| 663 |
+
# position: relative;
|
| 664 |
+
# top: -62px;
|
| 665 |
+
# }
|
| 666 |
+
|
| 667 |
+
# /* Make the toolbar transparent and the content below it clickable */
|
| 668 |
+
# .st-emotion-cache-18ni7ap {
|
| 669 |
+
# pointer-events: none;
|
| 670 |
+
# background: rgb(255 255 255 / 0%)
|
| 671 |
+
# }
|
| 672 |
+
# .st-emotion-cache-zq5wmm {
|
| 673 |
+
# pointer-events: auto;
|
| 674 |
+
# background: rgb(255 255 255);
|
| 675 |
+
# border-radius: 5px;
|
| 676 |
+
# }
|
| 677 |
+
# </style>
|
| 678 |
+
# """, unsafe_allow_html=True)
|
| 679 |
+
|
| 680 |
+
######
|
| 681 |
+
def main():
|
| 682 |
+
# st.title('PAGE TITLE') # Change this for each page
|
| 683 |
+
sidebar()
|
| 684 |
+
########
|
| 685 |
st.subheader('AUTOMATED SOURCE TO TARGET MAPPING')
|
| 686 |
mode= st.selectbox('Select Mode of Mapping',('Supervised Mapping(You Have Sufficient Sample Data in Target Template)', 'Unsupervised Mapping(You Do Not Have Sufficient Sample Data in Target Template)'), index=None,placeholder='Select category of table')
|
| 687 |
if mode == 'Supervised Mapping(You Have Sufficient Sample Data in Target Template)':
|