rahgadda committed on
Commit
89a1200
·
1 Parent(s): 81edf33

Initial Draft

Browse files
Files changed (1) hide show
  1. pages/3_Mapping.py +189 -0
pages/3_Mapping.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import streamlit as st
4
+ from streamlit_extras.switch_page_button import switch_page
5
+ import pandas as pd
6
+ import numpy as np
7
+ import torch
8
+ import faiss
9
+ from sentence_transformers import SentenceTransformer
10
+ import csv
11
+
12
+ ################################
13
+ ######### Variables ############
14
+ ################################
15
# -- Loading Variables
# Directory of sys.argv[0]; the data/ folder is resolved relative to it.
script_directory = os.path.dirname(os.path.abspath(sys.argv[0]))
source_df = pd.DataFrame()
destination_df = pd.DataFrame()


@st.cache_resource(show_spinner=False)
def _load_model():
    """Load the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; caching
    the model avoids re-instantiating it each time. The spinner is disabled
    so no UI element is emitted before st.set_page_config() is called.
    """
    return SentenceTransformer('all-mpnet-base-v2')


model = _load_model()

# -- Loading Session Data
if 'project_data' not in st.session_state:
    st.session_state.project_data = pd.read_csv(script_directory+'/data/project.csv')

# The Save button and the mapping table read mapping_df on every rerun, so it
# must exist before "Suggest" or "Load Mapping" has ever been clicked.
if 'mapping_df' not in st.session_state:
    st.session_state.mapping_df = pd.DataFrame(
        columns=["Sno", "DestinationColumn", "SourceColumn", "Type", "Expression"]
    )
24
+
25
+ ################################
26
+ ####### GenericFunctions #######
27
+ ################################
28
+ # -- Create Embedding - all-mpnet-base-v2 - https://www.sbert.net/docs/pretrained_models.html
29
def embed_text(text):
    """Return the embedding the module-level ``model`` produces for ``text``."""
    return model.encode(text)
32
+
33
def embed_list(list):
    """Embed every entry of ``list`` and return the embeddings in input order.

    Each entry is encoded individually through embed_text().
    """
    # NOTE: the parameter name shadows the builtin ``list``; it is kept
    # because it is part of the function's signature.
    return [embed_text(entry) for entry in list]
38
+
39
+ # -- Store embeddings in a FAISS Vector database
40
def store_embeddings(embeddings):
    """Build an in-memory FAISS inner-product index over ``embeddings``.

    Args:
        embeddings: Non-empty sequence of equal-length 1-D vectors.

    Returns:
        A ``faiss.IndexFlatIP`` holding the vectors in input order.

    Raises:
        ValueError: If ``embeddings`` is empty or does not stack into a
            2-D matrix.
    """
    # FAISS works on C-contiguous float32 matrices; convert once up front
    # instead of relying on the dtype the encoder happened to return.
    matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty sequence of equal-length vectors"
        )
    index = faiss.IndexFlatIP(matrix.shape[1])
    index.add(matrix)
    # faiss.write_index(index, "data/vector_db.index")
    return index
46
+
47
+ # -- Perform semantic search using embeddings
48
def semantic_search(query_embedding, index, k=1):
    """Return the first id that ``index`` reports for ``query_embedding``.

    ``index.search`` is called with a single-row batch and ``k`` requested
    neighbours; only the first returned id of that row is handed back.
    """
    query_batch = np.array([query_embedding])
    _, neighbour_ids = index.search(query_batch, k)
    return neighbour_ids[0][0]
51
+
52
+ ################################
53
+ ####### Display of data ########
54
+ ################################
55
# -- Streamlit Settings
st.set_page_config(layout='wide')
st.title("Mapping")

# -- Add Project Dropdown
# Three empty text elements act as vertical spacing under the title.
for _ in range(3):
    st.text("")
col1, col2, col3 = st.columns(3)
project_names = st.session_state.project_data['Project']
option = col1.selectbox('Select Project', project_names)
# A second, empty row of columns follows the dropdown row.
col1, col2, col3 = st.columns(3)
66
+
67
+
68
# -- Destination File Name
for _ in range(2):
    st.text("")

col1, col2, col3 = st.columns(3)
# Row(s) of the project table matching the project picked in the dropdown.
selected_project = st.session_state.project_data[
    st.session_state.project_data['Project'] == option
]
destination_name = selected_project.Destination.values[0]
destination_file_format = col1.file_uploader(
    f"Destination file name - {destination_name!s}.csv",
    type="csv",
    key="destination_file_format",
    accept_multiple_files=True,
)

# Every uploaded file is parsed; the last one read is the one that is kept.
for uploaded in destination_file_format or ():
    destination_df = pd.read_csv(uploaded)

# -- Source File Name
source_name = selected_project.Source.values[0]
source_file_format = col3.file_uploader(
    f"Source file name - {source_name!s}.csv",
    type="csv",
    key="source_file_format",
    accept_multiple_files=True,
)

for uploaded in source_file_format or ():
    source_df = pd.read_csv(uploaded)
101
+
102
# -- Suggest Button
for _ in range(2):
    st.text("")
col1, col2, col3 = st.columns([0.25,0.2,2.55])
if col1.button("Suggest"):
    # Every click starts from an empty mapping with the expected columns.
    st.session_state.mapping_df = pd.DataFrame(columns=["Sno","DestinationColumn","SourceColumn","Type","Expression"])
    both_files_loaded = len(destination_df) != 0 and len(source_df) != 0
    if not both_files_loaded:
        st.error("Select Source and Destination Files")
    else:
        # Source - KnowledgeBase
        source_columns = source_df["Columns"].tolist()
        source_index = store_embeddings(embed_list(source_columns))

        # Map to Source: for each destination row, look up the source entry
        # that semantic_search() returns for its embedding.
        suggestions = []
        for row_number in range(len(destination_df)):
            destination_column = destination_df.loc[row_number, "Columns"]
            best_match = semantic_search(embed_text(destination_column), source_index)
            suggestions.append({
                "Sno": row_number + 1,
                "DestinationColumn": destination_column,
                "SourceColumn": source_columns[best_match],
                "Type": None,
                "Expression": None,
            })

        # Saving Mapping and displaying
        st.session_state.mapping_df = pd.concat(
            [st.session_state.mapping_df, pd.DataFrame(suggestions)],
            ignore_index=True,
        )
137
+
138
# -- Save Button
def _mapping_file_path():
    """Return the mapping CSV path for the currently selected project.

    The path is <script_directory>/data/<Id>_<Source>_<Destination>.csv,
    built from the first project row whose 'Project' equals ``option``.
    """
    selected = st.session_state.project_data[
        st.session_state.project_data['Project'] == option
    ]
    return (
        script_directory + '/data/'
        + str(selected.Id.values[0]) + "_"
        + selected.Source.values[0] + "_"
        + selected.Destination.values[0] + '.csv'
    )


if col2.button("Save"):
    if (len(destination_df) > 0 and len(source_df) > 0 and len(st.session_state.mapping_df)>0):
        # Pipe-separated and unquoted; the Load button reads it back the same way.
        st.session_state.mapping_df.to_csv(_mapping_file_path(), index=False, sep="|",quoting=csv.QUOTE_NONE)
    else:
        st.error("Transformation not created")

# -- Load Existing Mapping
if col3.button("Load Mapping"):
    try:
        st.session_state.mapping_df = pd.read_csv(_mapping_file_path(),sep="|",quoting=csv.QUOTE_NONE)
    except FileNotFoundError:
        # Nothing has been saved for this project yet: report it instead of
        # letting the page fail with a traceback.
        st.error("No saved mapping found for the selected project")
152
+
153
# -- Display Mapping Table
files_ready = len(destination_df) > 0 and len(source_df) > 0
if files_ready and len(st.session_state.mapping_df) > 0:
    st.text("")
    st.header("Mapping Details")
    st.text("")
    st.text("")

    # Per-column editor configuration, built up front for readability.
    mapping_columns = {
        "Sno": st.column_config.TextColumn("Sno"),
        "DestinationColumn": st.column_config.TextColumn("DestinationColumn"),
        "SourceColumn": st.column_config.SelectboxColumn(
            "SourceColumn",
            width="medium",
            options=source_df["Columns"],
        ),
        "Type": st.column_config.SelectboxColumn(
            "Type",
            width="medium",
            options=["Pandas", "Constant"],
        ),
        "Expression": st.column_config.TextColumn("Expression"),
    }

    # The editor's return value is written back to the session state.
    st.session_state.mapping_df = st.data_editor(
        st.session_state.mapping_df,
        height=400,
        width=1200,
        hide_index=True,
        column_config=mapping_columns,
        disabled=["Sno","DestinationColumn"],
    )