1MR committed on
Commit 27b7701 · verified · 1 Parent(s): 55ed2e6

Upload 8 files

Files changed (8)
  1. Information.py +61 -0
  2. Main.py +122 -0
  3. Preprocessing1.py +145 -0
  4. Preprocessing2.py +217 -0
  5. RAG.py +222 -0
  6. Rag.txt +28 -0
  7. Virtualization.py +75 -0
  8. tempCodeRunnerFile.py +1 -0
Information.py ADDED
@@ -0,0 +1,61 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import io
+ import matplotlib.pyplot as plt
+ from sklearn.preprocessing import LabelEncoder
+ import seaborn as sns
+ import base64
+
+
+
+
+ def show_general_data_statistics():
+     if "data" in st.session_state:
+         data = st.session_state["data"]
+         num_var = len(data.columns)
+         num_rows = len(data)
+         missing_cells = data.isnull().sum().sum()
+         missing_cells_percent = (missing_cells / (data.size)) * 100
+         duplicate_rows = data.duplicated().sum()
+         duplicate_rows_percent = (duplicate_rows / num_rows) * 100
+         var_types = data.dtypes.value_counts()
+
+         st.write("### General Data Statistics:")
+         st.write(f"- **Number of Variables:** {num_var}")
+         st.write(f"- **Number of Rows:** {num_rows}")
+         st.write(f"- **Missing Cells:** {missing_cells}")
+         st.write(f"- **Missing Cells (%):** {missing_cells_percent:.2f}%")
+         st.write(f"- **Duplicate Rows:** {duplicate_rows}")
+         st.write(f"- **Duplicate Rows (%):** {duplicate_rows_percent:.2f}%")
+         st.write("#### Variable Types:")
+         st.write(var_types)
+     else:
+         st.warning("Please upload a dataset first.")
+
+
+
+
+ def describe_data():
+     st.title("Describe Data")
+
+     if "data" in st.session_state:
+         data = st.session_state["data"]
+         st.write("Dataset Description:")
+         st.write(data.describe())
+     else:
+         st.warning("Please upload a dataset first.")
+
+
+ def info_data():
+     st.title("Dataset Info")
+
+     if "data" in st.session_state:
+         data = st.session_state["data"]
+         buffer = io.StringIO()
+         data.info(buf=buffer)
+         info = buffer.getvalue()
+         st.text(info)
+     else:
+         st.warning("Please upload a dataset first.")
+
Main.py ADDED
@@ -0,0 +1,122 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import io
+ import matplotlib.pyplot as plt
+ from sklearn.preprocessing import LabelEncoder
+ import seaborn as sns
+ import base64
+ import json
+ from langchain.docstore.document import Document
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.llms import HuggingFaceHub
+ from langchain.chains import RetrievalQA
+ from Information import show_general_data_statistics, describe_data, info_data
+ from Preprocessing1 import preview_data, data_cleaning, modify_column_names
+ from Preprocessing2 import handle_categorical_values, missing_values, handle_duplicates, handle_outliers
+ from Virtualization import visualize_data
+
+
+ def upload_data():
+     st.title("Upload Dataset")
+     file = st.file_uploader("Upload your dataset", type=[
+         "csv", "xlsx"], key="file_uploader_1")
+
+     if file:
+         try:
+             if file.name.endswith(".csv"):
+                 data = pd.read_csv(file)
+             elif file.name.endswith(".xlsx"):
+                 data = pd.read_excel(file)
+
+             st.session_state["data"] = data
+             st.success("Dataset uploaded successfully!")
+         except Exception as e:
+             st.error(f"Error loading file: {e}")
+     return file
+
+
+ def download_data():
+     """Downloads the DataFrame as a CSV file."""
+     if "data" in st.session_state and not st.session_state["data"].empty:
+         csv = st.session_state["data"].to_csv(index=False).encode('utf-8')
+
+         download_button = st.download_button(
+             label="Download Cleaned Dataset",
+             data=csv,
+             file_name="cleaned_data.csv",
+             mime="text/csv"
+         )
+
+         if download_button:
+             st.balloons()
+             st.success("Dataset is ready for download!")
+
+     else:
+         st.warning(
+             "No data available to download. Please modify or upload a dataset first.")
+
+
+ def rag_chatbot():
+     pass
+
+
+ def main():
+     st.sidebar.title("Navigation")
+     options = st.sidebar.radio(
+         "Go to",
+         [
+             "Upload",
+             "Preview",
+             "Data Cleaning",
+             "Modify Column Names",
+             "General Data Statistics",
+             "Describe",
+             "Info",
+             "Handle Categorical",
+             "Missing Values",
+             "Handle Duplicates",
+             "Handle Outliers",
+             "Visualize Data",
+             "Download",
+             "RAG Chatbot"
+         ],
+         key="unique_navigation_key",
+     )
+
+     if options == "Upload":
+         upload_data()
+     elif options == "Preview":
+         preview_data()
+     elif options == "Data Cleaning":
+         data_cleaning()
+     elif options == "Modify Column Names":
+         modify_column_names()
+     elif options == "General Data Statistics":
+         show_general_data_statistics()
+     elif options == "Describe":
+         describe_data()
+     elif options == "Info":
+         info_data()
+     elif options == "Handle Categorical":
+         handle_categorical_values()
+     elif options == "Missing Values":
+         missing_values()
+     elif options == "Handle Duplicates":
+         handle_duplicates()
+     elif options == "Handle Outliers":
+         handle_outliers()
+     elif options == "Visualize Data":
+         visualize_data()
+     elif options == "Download":
+         download_data()
+     elif options == "RAG Chatbot":
+         rag_chatbot()
+
+     else:
+         st.warning("Please upload a dataset first.")
+
+
+ if __name__ == "__main__":
+     main()
Preprocessing1.py ADDED
@@ -0,0 +1,145 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import io
+ import matplotlib.pyplot as plt
+ from sklearn.preprocessing import LabelEncoder
+ import seaborn as sns
+ import base64
+
+
+ def preview_data():
+     if "data" in st.session_state:
+         data = st.session_state["data"]
+
+         st.write("### Dataset Preview Options:")
+
+         preview_option = st.radio(
+             "Select how to preview the dataset:",
+             options=["Head", "Tail", "Custom Number of Rows"],
+             index=0
+         )
+
+         if preview_option == "Head":
+             st.write("### First 5 Rows of the Dataset:")
+             st.dataframe(data.head())
+         elif preview_option == "Tail":
+             st.write("### Last 5 Rows of the Dataset:")
+             st.dataframe(data.tail())
+         elif preview_option == "Custom Number of Rows":
+             number = st.slider(
+                 "Select Number of Rows to Display:", 1, len(data))
+             st.write(f"### First {number} Rows of the Dataset:")
+             st.dataframe(data.head(number))
+
+         # Show entire data
+         if st.checkbox("Show all data"):
+             st.write(data)
+
+         # Show column names
+         if st.checkbox("Show Column Names"):
+             st.write(data.columns)
+
+         # Show dataset dimensions (rows and columns)
+         if st.checkbox("Show Dimensions"):
+             st.write(data.shape)
+
+     else:
+         st.warning("Please upload a dataset to view options.")
+
+
+ def data_cleaning():
+     if "data" in st.session_state:
+         data = st.session_state["data"]
+
+         st.subheader("Data Cleaning")
+
+         col_option = st.selectbox("Choose your option", [
+             "Check all numeric features are numeric?", "Show unique values of categorical features"])
+
+         # Check and convert numeric columns
+         if col_option == "Check all numeric features are numeric?":
+             st.write("Converting all numeric columns to numeric types...")
+             numeric_columns = list(
+                 data.select_dtypes(include=np.number).columns)
+             for col in numeric_columns:
+                 data[col] = pd.to_numeric(data[col], errors='coerce')
+
+             st.success("Done!")
+
+         # Show unique values for categorical features
+         elif col_option == "Show unique values of categorical features":
+             st.write("Unique values for categorical features:")
+             for column in data.columns:
+                 # check for categorical features (strings)
+                 if data[column].dtype == object:
+                     st.write(f"{column}: {data[column].unique()}")
+                     st.write("====================================")
+
+     else:
+         st.warning("Please upload a dataset to perform data cleaning.")
+
+
+ def modify_column_names():
+     st.title("Modify Column Names")
+
+     # Ensure data exists in the session
+     if "data" in st.session_state:
+         df = st.session_state["data"]
+
+         # Ensure modified_columns is initialized in session state
+         if "modified_columns" not in st.session_state:
+             st.session_state.modified_columns = list(df.columns)
+
+         st.write('### *Current Column Names*')
+         st.table(df.columns)
+
+         st.write('### *Modify Column Names*')
+         with st.expander("Modify Column Names", expanded=True):
+             # Use the modified columns from session state
+             before_col = st.session_state.modified_columns
+             before_col_df = pd.DataFrame(before_col, columns=['Column Name'])
+             st.table(before_col_df)
+
+             col3, col4, col5, col6 = st.columns(4)
+             changes_made = False  # Flag to track if any change is made
+
+             if st.button('Convert to Uppercase'):
+                 st.session_state.modified_columns = [
+                     col.upper() for col in before_col]
+                 changes_made = True
+             if st.button('Convert to Lowercase'):
+                 st.session_state.modified_columns = [
+                     col.lower() for col in before_col]
+                 changes_made = True
+             if st.button('Replace Spaces with Underscore'):
+                 st.session_state.modified_columns = [
+                     col.replace(" ", "_") for col in before_col]
+                 changes_made = True
+             if st.button('Capitalize First Letters'):
+                 st.session_state.modified_columns = [
+                     col.title() for col in before_col]
+                 changes_made = True
+
+             # Apply the changes only if a change was made
+             if changes_made:
+                 df.columns = st.session_state.modified_columns
+                 st.success("Changes applied successfully.")
+                 st.table(pd.DataFrame(
+                     df.columns, columns=['Modified Columns']))
+
+         st.write("### *Modify a Specific Column Name*")
+         column_select = st.selectbox(
+             'Select column to modify', options=st.session_state.modified_columns)
+         new_column_name = st.text_input('Enter new column name')
+         if st.button('Update Column Name'):
+             if column_select and new_column_name:
+                 st.session_state.modified_columns = [
+                     new_column_name if col == column_select else col for col in st.session_state.modified_columns]
+                 df.columns = st.session_state.modified_columns
+                 st.success("Column name updated.")
+                 st.table(pd.DataFrame(
+                     df.columns, columns=['Modified Columns']))
+
+     else:
+         st.warning("Please upload a dataset first.")
Preprocessing2.py ADDED
@@ -0,0 +1,217 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import io
+ import matplotlib.pyplot as plt
+ from sklearn.preprocessing import LabelEncoder
+ import seaborn as sns
+ import base64
+
+
+ def handle_categorical_values():
+     if "data" in st.session_state:
+         data = st.session_state["data"]
+
+         st.subheader("Handle Categorical Values")
+
+         categorical_cols_features = list(
+             data.select_dtypes(include="object").columns)
+
+         # One-Hot Encoding for nominal categorical features
+         one_hot_enc = st.multiselect(
+             "Select nominal categorical columns", categorical_cols_features)
+
+         # Apply one-hot encoding to selected columns
+         if one_hot_enc:
+             for column in one_hot_enc:
+                 if data[column].dtype == 'object':  # Only apply to categorical/string columns
+                     data = pd.get_dummies(data, columns=[column])
+             st.write("### Data after One-Hot Encoding:")
+             st.write(data.head())
+
+         # Label Encoding for ordinal categorical features
+         label_encoder = LabelEncoder()
+         label_enc = st.multiselect(
+             "Select ordinal categorical columns", categorical_cols_features)
+
+         # Apply label encoding to selected columns
+         if label_enc:
+             for column in label_enc:
+                 if data[column].dtype == 'object':  # Only apply to categorical/string columns
+                     data[column] = label_encoder.fit_transform(data[column])
+             st.write("### Data after Label Encoding:")
+             st.write(data.head())
+
+     else:
+         st.warning("Please upload a dataset to handle categorical values.")
+
+
+ def missing_values():
+     st.title("Handle Missing Values")
+
+     if "data" in st.session_state:
+         data = st.session_state["data"].copy()
+
+         action = st.selectbox(
+             "Select Action", ["Drop", "Dropna", "Fill missing val"])
+
+         column = st.selectbox("Select Column", data.columns)
+
+         st.write("### Before:")
+         st.dataframe(data)
+
+         modified_data = data.copy()
+
+         if action == "Drop":
+             modified_data.drop(columns=[column], inplace=True)
+         elif action == "Dropna":
+             modified_data.dropna(subset=[column], inplace=True)
+         elif action == "Fill missing val":
+             fill_method = st.selectbox(
+                 "Select fill method", ["Mean", "Mode", "Median"])
+
+             if fill_method == "Mean":
+                 fill_value = data[column].mean()
+             elif fill_method == "Mode":
+                 fill_value = data[column].mode()[0]
+             elif fill_method == "Median":
+                 fill_value = data[column].median()
+
+             modified_data[column] = modified_data[column].fillna(fill_value)
+
+         st.write("### After (Preview):")
+         st.dataframe(modified_data)
+
+         if st.button("OK"):
+             st.session_state["data"] = modified_data
+             st.success("Done! The action has been applied.")
+             st.write("### After:")
+             st.dataframe(modified_data)
+
+     else:
+         st.warning("Please upload a dataset first.")
+
+
+ def handle_duplicates():
+     st.title("Handle Duplicates")
+
+     if "data" in st.session_state:
+         data = st.session_state["data"].copy()
+
+         action = st.selectbox(
+             "Select Action", ["Drop Duplicates", "Drop Duplicates in Column", "Keep First", "Keep Last"])
+
+         if action in ["Drop Duplicates in Column", "Keep First", "Keep Last"]:
+             column = st.selectbox("Select Column", data.columns)
+         else:
+             column = None
+
+         st.write("### Before:")
+         st.dataframe(data)
+
+         after_placeholder = st.empty()
+
+         modified_data = data.copy()
+
+         if action == "Drop Duplicates":
+             modified_data.drop_duplicates(inplace=True)
+         elif action == "Drop Duplicates in Column":
+             modified_data.drop_duplicates(subset=[column], inplace=True)
+         elif action == "Keep First":
+             modified_data.drop_duplicates(
+                 subset=[column], keep="first", inplace=True)
+         elif action == "Keep Last":
+             modified_data.drop_duplicates(
+                 subset=[column], keep="last", inplace=True)
+
+         st.write("### After (Preview):")
+         st.dataframe(modified_data)
+
+         if st.button("OK"):
+             st.session_state["data"] = modified_data
+             st.success("Done! The action has been applied.")
+             st.write("### After:")
+             st.dataframe(modified_data)
+
+     else:
+         st.warning("Please upload a dataset first.")
+
+
+ def handle_outliers():
+     st.title("Handle Outliers")
+
+     if "data" in st.session_state:
+         data = st.session_state["data"].copy()
+
+         column = st.selectbox("Select Column", data.select_dtypes(
+             include=[np.number]).columns)
+
+         action = st.selectbox(
+             "Select Action",
+             ["Remove Outliers (IQR)", "Set Bounds Manually",
+              "Replace Outliers"]
+         )
+
+         st.write("### Before:")
+         st.dataframe(data)
+
+         after_placeholder = st.empty()
+
+         modified_data = data.copy()
+
+         if action == "Remove Outliers (IQR)":
+             Q1 = data[column].quantile(0.25)
+             Q3 = data[column].quantile(0.75)
+             IQR = Q3 - Q1
+             lower_bound = Q1 - 1.5 * IQR
+             upper_bound = Q3 + 1.5 * IQR
+
+             # Remove outliers
+             modified_data = modified_data[(
+                 modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
+
+         elif action == "Set Bounds Manually":
+             # User inputs for bounds
+             lower_bound = st.number_input(
+                 f"Set lower bound for {column}", value=float(data[column].min()))
+             upper_bound = st.number_input(
+                 f"Set upper bound for {column}", value=float(data[column].max()))
+
+             modified_data = modified_data[(
+                 modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
+
+         elif action == "Replace Outliers":
+
+             Q1 = data[column].quantile(0.25)
+             Q3 = data[column].quantile(0.75)
+             IQR = Q3 - Q1
+             lower_bound = Q1 - 1.5 * IQR
+             upper_bound = Q3 + 1.5 * IQR
+
+             replace_method = st.radio(
+                 "Select Replacement Method",
+                 ["Mean", "Median"]
+             )
+
+             if replace_method == "Mean":
+                 replacement_value = data[column].mean()
+             else:
+                 replacement_value = data[column].median()
+
+             # Replace outliers
+             modified_data[column] = modified_data[column].apply(
+                 lambda x: replacement_value if x < lower_bound or x > upper_bound else x
+             )
+
+         # After Visualization
+         st.write("### After (Preview):")
+         st.dataframe(modified_data)
+
+         if st.button("OK"):
+             st.session_state["data"] = modified_data
+             st.success("Done! The action has been applied.")
+             st.write("### After:")
+             st.dataframe(modified_data)
+
+     else:
+         st.warning("Please upload a dataset first.")
RAG.py ADDED
@@ -0,0 +1,222 @@
+ import streamlit as st
+ import pandas as pd
+ from langchain.document_loaders import DirectoryLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.llms import Ollama
+ from langchain.vectorstores import FAISS
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+ from langchain_core.runnables.history import RunnableWithMessageHistory
+ from langchain_community.chat_message_histories import ChatMessageHistory
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from sentence_transformers import SentenceTransformer, util
+ from langchain.schema import Document
+ from langchain_core.chat_history import BaseChatMessageHistory
+ from langchain.chains import create_history_aware_retriever
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ bot_template = '''
+ <div style="display: flex; align-items: center; margin-bottom: 10px; background-color: #B22222; padding: 10px; border-radius: 10px; border: 1px solid #7A0000;">
+     <div style="flex-shrink: 0; margin-right: 10px;">
+         <img src="https://raw.githubusercontent.com/AalaaAyman24/Test/main/chatbot.png"
+              style="max-height: 50px; max-width: 50px; object-fit: cover;">
+     </div>
+     <div style="background-color: #B22222; color: white; padding: 10px; border-radius: 10px; max-width: 75%; word-wrap: break-word; overflow-wrap: break-word;">
+         {msg}
+     </div>
+ </div>
+ '''
+
+
+ user_template = '''
+ <div style="display: flex; align-items: center; margin-bottom: 10px; justify-content: flex-end;">
+     <div style="flex-shrink: 0; margin-left: 10px;">
+         <img src="https://raw.githubusercontent.com/AalaaAyman24/Test/main/question.png"
+              style="max-height: 50px; max-width: 50px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div style="background-color: #757882; color: white; padding: 10px; border-radius: 10px; max-width: 75%; word-wrap: break-word; overflow-wrap: break-word;">
+         {msg}
+     </div>
+ </div>
+ '''
+
+ button_style = """
+ <style>
+ .small-button {
+     display: inline-block;
+     padding: 5px 10px;
+     font-size: 12px;
+     color: white;
+     background-color: #007bff;
+     border: none;
+     border-radius: 5px;
+     cursor: pointer;
+     margin-right: 5px;
+ }
+ .small-button:hover {
+     background-color: #0056b3;
+ }
+ .chat-box {
+     position: fixed;
+     bottom: 20px;
+     width: 100%;
+     left: 0;
+     padding: 20px;
+     background-color: #f1f1f1;
+     border-radius: 10px;
+     box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+ }
+ </style>
+ """
+
+ # Function to prepare and split documents from CSV or Excel
+
+
+ def prepare_and_split_docs(files):
+     split_docs = []
+     for file in files:
+         # Read the file with pandas based on the extension
+         if file.name.endswith('.csv'):
+             df = pd.read_csv(file)
+         elif file.name.endswith('.xlsx'):
+             df = pd.read_excel(file)
+
+         # Convert dataframe to text for document splitting (this could vary based on the structure of the data)
+         # Convert the whole dataframe to string without index
+         text = df.to_string(index=False)
+
+         # Wrap the string into a Document object
+         document = Document(page_content=text, metadata={"source": file.name})
+
+         # Create the splitter and split the document
+         splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+             chunk_size=512,
+             chunk_overlap=256,
+             disallowed_special=(),
+             separators=["\n\n", "\n", " "]
+         )
+         split_docs.extend(splitter.split_documents([document]))
+     return split_docs
+
+ # Function to ingest documents into the vector database
+
+
+ def ingest_into_vectordb(split_docs):
+     embeddings = HuggingFaceEmbeddings(
+         model_name='sentence-transformers/all-MiniLM-L6-v2')
+     db = FAISS.from_documents(split_docs, embeddings)
+     DB_FAISS_PATH = 'vectorstore/db_faiss'
+     db.save_local(DB_FAISS_PATH)
+     return db
+
+ # Function to get the conversation chain
+
+
+ def get_conversation_chain(retriever):
+     llm = Ollama(model="llama3.2:1b")
+     contextualize_q_system_prompt = (
+         "Given the chat history and the latest user question, "
+         "provide a response that directly addresses the user's query based on the provided documents. "
+         "Do not rephrase the question or ask follow-up questions."
+     )
+
+     contextualize_q_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", contextualize_q_system_prompt),
+             MessagesPlaceholder("chat_history"),
+             ("human", "{input}"),
+         ]
+     )
+     history_aware_retriever = create_history_aware_retriever(
+         llm, retriever, contextualize_q_prompt
+     )
+
+     system_prompt = (
+         "As a personal chat assistant, provide accurate and relevant information based on the provided document in 2-3 sentences. "
+         "Answer should be limited to 50 words and 2-3 sentences. Do not prompt to select answers or formulate a stand-alone question."
+         "{context}"
+     )
+
+     qa_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", system_prompt),
+             MessagesPlaceholder("chat_history"),
+             ("human", "{input}"),
+         ]
+     )
+     question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+
+     rag_chain = create_retrieval_chain(
+         history_aware_retriever, question_answer_chain)
+
+     store = {}
+
+     def get_session_history(session_id: str) -> BaseChatMessageHistory:
+         if session_id not in store:
+             store[session_id] = ChatMessageHistory()
+         return store[session_id]
+
+     conversational_rag_chain = RunnableWithMessageHistory(
+         rag_chain,
+         get_session_history,
+         input_messages_key="input",
+         history_messages_key="chat_history",
+         output_messages_key="answer",
+     )
+     return conversational_rag_chain
+
+
+ def calculate_similarity_score(answer: str, context_docs: list) -> float:
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     context_docs = [doc.page_content for doc in context_docs]
+     answer_embedding = model.encode(answer, convert_to_tensor=True)
+     context_embeddings = model.encode(context_docs, convert_to_tensor=True)
+     similarities = util.pytorch_cos_sim(answer_embedding, context_embeddings)
+     max_score = similarities.max().item()
+     return max_score
+
+
+ st.title("What can I help with⁉️")
+
+ # Sidebar for file upload
+ uploaded_files = st.sidebar.file_uploader(
+     "Upload CSV/Excel Documents", type=["csv", "xlsx"], accept_multiple_files=True)
+
+ if uploaded_files:
+     if st.sidebar.button("Process Documents"):
+         split_docs = prepare_and_split_docs(uploaded_files)
+         vector_db = ingest_into_vectordb(split_docs)
+         retriever = vector_db.as_retriever()
+         st.sidebar.success("Documents processed and vector database created!")
+
+         # Initialize the conversation chain
+         conversational_chain = get_conversation_chain(retriever)
+         st.session_state.conversational_chain = conversational_chain
+
+ if 'chat_history' not in st.session_state:
+     st.session_state.chat_history = []
+
+ # Chat input
+ st.markdown(button_style, unsafe_allow_html=True)
+ user_input = st.text_input("Ask a question about the dataset:",
+                            key="user_input", placeholder="Type your question here...")
+
+
+ if st.button("Submit"):
+     st.markdown(button_style, unsafe_allow_html=True)
+     if user_input and 'conversational_chain' in st.session_state:
+         session_id = "abc123"
+         conversational_chain = st.session_state.conversational_chain
+         response = conversational_chain.invoke({"input": user_input}, config={
+             "configurable": {"session_id": session_id}})
+         context_docs = response.get('context', [])
+         st.session_state.chat_history.append(
+             {"user": user_input, "bot": response['answer'], "context_docs": context_docs})
+
+ # Display chat history
+ if st.session_state.chat_history:
+     for message in st.session_state.chat_history:
+         st.markdown(user_template.format(
+             msg=message['user']), unsafe_allow_html=True)
+         st.markdown(bot_template.format(
+             msg=message['bot']), unsafe_allow_html=True)
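
calculate_similarity_score is defined in RAG.py but never called from the Submit handler, so answers are not scored against their retrieved context. A minimal sketch of the extra lines that could follow the chat_history.append call inside that handler (hypothetical addition, reusing the response and context_docs variables already in scope there):

# Hypothetical grounding check: score the answer against its retrieved context chunks.
if context_docs:
    similarity = calculate_similarity_score(response['answer'], context_docs)
    st.caption(f"Similarity to retrieved context: {similarity:.2f}")
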
Rag.txt ADDED
@@ -0,0 +1,28 @@
+
+ def rag_chatbot():
+     st.title("RAG Chatbot")
+
+     # Check if data is uploaded
+     if "data" in st.session_state and isinstance(st.session_state["data"], pd.DataFrame):
+         df = st.session_state["data"]
+
+         # Convert data to documents
+         st.write("Processing the dataset...")
+         documents = create_doucment(df)
+         st.write(f"Created {len(documents)} documents.")
+
+         # Load models
+         st.write("Loading models...")
+         embedding = load_models_embedding()
+         llm = load_models_llm()
+
+         # Create retriever
+         retriever = create_database(embedding, documents).as_retriever()
+
+         # Ask a question
+         question = st.text_input("Ask a question about your dataset:")
+         if question:
+             response = ask_me(question, retriever, llm)
+             st.write(f"Answer: {response}")
+     else:
+         st.warning("Please upload a dataset to proceed.")
Virtualization.py ADDED
@@ -0,0 +1,75 @@
+ import streamlit as st
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import plotly.express as px
+
+
+ def visualize_data():
+     st.title("Data Visualization")
+
+     if "data" in st.session_state:
+         df = st.session_state["data"]
+
+         chart_type = st.selectbox("Choose Chart Type", [
+             "Bar Chart", "Histogram", "Boxplot", "Doughnut Chart", "Pie Chart"])
+
+         columns = df.select_dtypes(include=['number']).columns.tolist()
+         selected_column = st.selectbox("Select Column", columns)
+
+         value_counts = df[selected_column].value_counts()
+
+         if chart_type == "Bar Chart":
+             if len(value_counts) > 20:
+                 st.warning(
+                     "Bar Chart is not suitable for more than 20 unique values. Please select a column with 20 or fewer unique values.")
+             else:
+                 st.subheader(f"Bar Chart for {selected_column}")
+                 fig, ax = plt.subplots()
+                 df[selected_column].value_counts().plot(kind='bar', ax=ax)
+                 st.pyplot(fig)
+
+         elif chart_type == "Histogram":
+             if len(value_counts) < 10:
+                 st.warning(
+                     "Histogram requires at least 10 unique values to be meaningful. Please select a column with more than 10 unique values.")
+             else:
+                 st.subheader(f"Histogram for {selected_column}")
+                 fig, ax = plt.subplots()
+                 ax.hist(df[selected_column], bins=20, edgecolor="black")
+                 ax.set_xlabel(selected_column)
+                 ax.set_ylabel('Frequency')
+                 st.pyplot(fig)
+
+         elif chart_type == "Boxplot":
+             if len(value_counts) < 5:
+                 st.warning(
+                     "Boxplot requires at least 5 unique values to show distribution. Please select a column with more than 5 unique values.")
+             else:
+                 st.subheader(f"Boxplot for {selected_column}")
+                 fig = plt.figure(figsize=(6, 4))
+                 sns.boxplot(x=df[selected_column])
+                 st.pyplot(fig)
+
+         elif chart_type == "Doughnut Chart":
+             if len(value_counts) > 5:
+                 st.warning(
+                     "Doughnut Chart is not suitable for more than 5 unique values. Please select a column with 5 or fewer unique values.")
+             else:
+                 st.subheader(f"Doughnut Chart for {selected_column}")
+                 fig = px.pie(value_counts, names=value_counts.index,
+                              values=value_counts.values, hole=0.3)
+                 st.plotly_chart(fig)
+
+         elif chart_type == "Pie Chart":
+             if len(value_counts) > 5:
+                 st.warning(
+                     "Pie Chart is not suitable for more than 5 unique values. Please select a column with 5 or fewer unique values.")
+             else:
+                 st.subheader(f"Pie Chart for {selected_column}")
+                 fig = px.pie(value_counts, names=value_counts.index,
+                              values=value_counts.values)
+                 st.plotly_chart(fig)
+
+     else:
+         st.warning("Please upload a dataset first.")
tempCodeRunnerFile.py ADDED
@@ -0,0 +1 @@
+ plotly.express