1MR committed on
Commit
5c802bc
·
verified ·
1 Parent(s): 13e10ec

Upload 4 files

Browse files
Files changed (4) hide show
  1. Information.py +61 -0
  2. Preprocessing1.py +132 -0
  3. Preprocessing2.py +222 -0
  4. RAG.py +72 -0
Information.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import io
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.preprocessing import LabelEncoder
7
+ import seaborn as sns
8
+ import base64
9
+
10
+
11
+
12
+
13
def show_general_data_statistics():
    """Display summary statistics for the dataset in ``st.session_state["data"]``.

    Shows column/row counts, missing-cell and duplicate-row totals (with
    percentages), and the dtype distribution. Warns if no dataset is loaded.
    """
    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    data = st.session_state["data"]
    num_var = len(data.columns)
    num_rows = len(data)
    missing_cells = data.isnull().sum().sum()
    # Guard empty datasets: the original divided by data.size / num_rows
    # unconditionally and raised ZeroDivisionError on an empty frame.
    missing_cells_percent = (missing_cells / data.size) * 100 if data.size else 0.0
    duplicate_rows = data.duplicated().sum()
    duplicate_rows_percent = (duplicate_rows / num_rows) * 100 if num_rows else 0.0
    var_types = data.dtypes.value_counts()

    st.write("### General Data Statistics:")
    st.write(f"- **Number of Variables:** {num_var}")
    st.write(f"- **Number of Rows:** {num_rows}")
    st.write(f"- **Missing Cells:** {missing_cells}")
    st.write(f"- **Missing Cells (%):** {missing_cells_percent:.2f}%")
    st.write(f"- **Duplicate Rows:** {duplicate_rows}")
    st.write(f"- **Duplicate Rows (%):** {duplicate_rows_percent:.2f}%")
    st.write("#### Variable Types:")
    st.write(var_types)
35
+
36
+
37
+
38
+
39
def describe_data():
    """Render ``DataFrame.describe()`` output for the uploaded dataset."""
    st.title("Describe Data")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    st.write("Dataset Description:")
    st.write(st.session_state["data"].describe())
48
+
49
+
50
def info_data():
    """Show the text produced by ``DataFrame.info()`` for the uploaded dataset."""
    st.title("Dataset Info")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    # DataFrame.info writes to a stream rather than returning a string,
    # so capture it through a StringIO buffer.
    buffer = io.StringIO()
    st.session_state["data"].info(buf=buffer)
    st.text(buffer.getvalue())
61
+
Preprocessing1.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import io
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.preprocessing import LabelEncoder
7
+ import seaborn as sns
8
+ import base64
9
+
10
+
11
def preview_data():
    """Interactive dataset preview: head/tail/custom row count, plus optional
    full-data, column-name, and shape views."""
    if "data" not in st.session_state:
        st.warning("Please upload a dataset to view options.")
        return

    data = st.session_state["data"]

    st.write("### Dataset Preview Options:")

    preview_option = st.radio(
        "Select how to preview the dataset:",
        options=["Head", "Tail", "Custom Number of Rows"],
        index=0,
    )

    if preview_option == "Head":
        st.write("### First 5 Rows of the Dataset:")
        st.dataframe(data.head())
    elif preview_option == "Tail":
        st.write("### Last 5 Rows of the Dataset:")
        st.dataframe(data.tail())
    elif preview_option == "Custom Number of Rows":
        number = st.slider("Select Number of Rows to Display:", 1, len(data))
        st.write(f"### First {number} Rows of the Dataset:")
        st.dataframe(data.head(number))

    # Optional extra views, each behind its own checkbox.
    if st.checkbox("Show all data"):
        st.write(data)

    if st.checkbox("Show Column Names"):
        st.write(data.columns)

    if st.checkbox("Show Dimensions"):
        st.write(data.shape)
49
+
50
+
51
def data_cleaning():
    """Basic cleaning helpers: re-coerce numeric columns and list the unique
    values of string (object-dtype) columns."""
    if "data" not in st.session_state:
        st.warning("Please upload a dataset to perform data cleaning.")
        return

    data = st.session_state["data"]

    st.subheader("Data Cleaning")

    options = [
        "Check all numeric features are numeric?",
        "Show unique values of categorical features",
    ]
    col_option = st.selectbox("Choose your option", options)

    if col_option == options[0]:
        st.write("Converting all numeric columns to numeric types...")
        # NOTE(review): this coerces only columns pandas ALREADY treats as
        # numeric, so it is effectively a no-op; confirm whether object-dtype
        # columns were meant to be converted instead.
        for col in data.select_dtypes(include=np.number).columns:
            data[col] = pd.to_numeric(data[col], errors='coerce')
        st.write("Done!")

    elif col_option == options[1]:
        st.write("Unique values for categorical features:")
        for column in data.columns:
            # Object dtype is used as the marker for categorical/string data.
            if data[column].dtype == object:
                st.write(f"{column}: {data[column].unique()}")
                st.write("====================================")
80
+
81
+
82
def modify_column_names():
    """UI for bulk-renaming dataset columns (case changes, space replacement)
    and renaming a single selected column.

    The working list of names lives in ``st.session_state["modified_columns"]``
    and is re-applied to the DataFrame on every rerun.
    """
    st.title("Modify Column Names")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    df = st.session_state["data"]

    # Initialize the working list, and reset it whenever its length no longer
    # matches the DataFrame (e.g. a new file was uploaded). The original read
    # st.session_state.modified_columns unconditionally below, which raised
    # before any button was ever clicked, and a stale list of the wrong
    # length made `df.columns = ...` raise a length-mismatch error.
    if ("modified_columns" not in st.session_state
            or len(st.session_state.modified_columns) != len(df.columns)):
        st.session_state.modified_columns = list(df.columns)

    st.write('### *Current Column Names*')
    st.table(df.columns)

    st.write('### *Modify Column Names*')
    with st.expander("Modify Column Names", expanded=True):
        before_col = st.session_state.modified_columns
        st.table(pd.DataFrame(before_col, columns=['Column Name']))

        col3, col4, col5, col6 = st.columns(4)
        if st.button('Convert to Uppercase'):
            st.session_state.modified_columns = [
                col.upper() for col in before_col]
        if st.button('Convert to Lowercase'):
            st.session_state.modified_columns = [
                col.lower() for col in before_col]
        if st.button('Replace Spaces with Underscore'):
            st.session_state.modified_columns = [
                col.replace(" ", "_") for col in before_col]
        if st.button('Capitalize First Letters'):
            st.session_state.modified_columns = [
                col.title() for col in before_col]

        # Re-apply the (possibly just-updated) names to the DataFrame.
        df.columns = st.session_state.modified_columns

        st.success("Changes applied successfully.")
        st.table(pd.DataFrame(df.columns, columns=['Modified Columns']))

    st.write("### *Modify a Specific Column Name*")
    column_select = st.selectbox(
        'Select column to modify', options=st.session_state.modified_columns)
    new_column_name = st.text_input('Enter new column name')
    if st.button('Update Column Name'):
        if column_select and new_column_name:
            st.session_state.modified_columns = [
                new_column_name if col == column_select else col
                for col in st.session_state.modified_columns]
            df.columns = st.session_state.modified_columns
            st.success("Column name updated.")
            st.table(pd.DataFrame(
                df.columns, columns=['Modified Columns']))
Preprocessing2.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import io
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.preprocessing import LabelEncoder
7
+ import seaborn as sns
8
+ import base64
9
+
10
+
11
def handle_categorical_values():
    """Encode string (object-dtype) columns: one-hot for nominal features,
    label encoding for ordinal features.

    Results are written back to ``st.session_state["data"]`` — the original
    rebound the one-hot result to a local variable only, so that encoding was
    silently lost on the next rerun.
    """
    if "data" not in st.session_state:
        st.warning("Please upload a dataset to handle categorical values.")
        return

    data = st.session_state["data"]

    st.subheader("Handle Categorical Values")

    categorical_cols_features = list(
        data.select_dtypes(include="object").columns)

    # One-hot encoding for nominal categorical features.
    one_hot_enc = st.multiselect(
        "Select nominal categorical columns", categorical_cols_features)

    if one_hot_enc:
        for column in one_hot_enc:
            # Guard: the column may already have been expanded, and only
            # string columns should be encoded.
            if column in data.columns and data[column].dtype == 'object':
                data = pd.get_dummies(data, columns=[column])
        # get_dummies returns a NEW frame; persist it or the encoding is lost.
        st.session_state["data"] = data
        st.write("### Data after One-Hot Encoding:")
        st.write(data.head())

    # Label encoding for ordinal categorical features.
    label_encoder = LabelEncoder()
    label_enc = st.multiselect(
        "Select ordinal categorical columns", categorical_cols_features)

    if label_enc:
        for column in label_enc:
            # Guard against columns removed by the one-hot step above
            # (the original raised KeyError in that case).
            if column in data.columns and data[column].dtype == 'object':
                data[column] = label_encoder.fit_transform(data[column])
        st.session_state["data"] = data
        st.write("### Data after Label Encoding:")
        st.write(data.head())
47
+
48
+
49
def missing_values():
    """Handle missing values: drop a column, drop rows with NaNs in a column,
    or fill NaNs with the column's mean/mode/median.

    Shows before/after views and persists the result to session state.
    """
    st.title("Handle Missing Values")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    data = st.session_state["data"].copy()

    action = st.selectbox(
        "Select Action", ["Drop", "Dropna", "Fill missing val"])

    column = st.selectbox("Select Column", data.columns)

    # The fill-method widget must be created on every rerun, BEFORE the OK
    # button. The original created it inside `if st.button("OK")`: a widget
    # first instantiated there disappears on the next script rerun, so the
    # chosen method could never actually be applied.
    fill_method = None
    if action == "Fill missing val":
        fill_method = st.selectbox(
            "Select fill method", ["Mean", "Mode", "Median"])

    # Before visualization.
    st.write("### Before:")
    st.dataframe(data)

    # Placeholder for the after visualization.
    after_placeholder = st.empty()

    if st.button("OK"):
        modified_data = data.copy()

        if action == "Drop":
            modified_data.drop(columns=[column], inplace=True)
        elif action == "Dropna":
            modified_data.dropna(subset=[column], inplace=True)
        elif action == "Fill missing val":
            if fill_method == "Mean":
                fill_value = data[column].mean()
            elif fill_method == "Mode":
                fill_value = data[column].mode()[0]
            else:  # "Median"
                fill_value = data[column].median()

            modified_data[column].fillna(fill_value, inplace=True)

        # After visualization.
        after_placeholder.write("### After:")
        after_placeholder.dataframe(modified_data)

        st.session_state["data"] = modified_data
95
+
96
+
97
def handle_duplicates():
    """Remove duplicate rows — across the whole frame or within one column,
    optionally keeping the first or last occurrence — and persist the result."""
    st.title("Handle Duplicates")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    data = st.session_state["data"].copy()

    actions = ["Drop Duplicates", "Drop Duplicates in Column",
               "Keep First", "Keep Last"]
    action = st.selectbox("Select Action", actions)

    # Only the column-scoped actions need a target column.
    if action == "Drop Duplicates":
        column = None
    else:
        column = st.selectbox("Select Column", data.columns)

    # Before visualization.
    st.write("### Before:")
    st.dataframe(data)

    # Placeholder for the after visualization.
    after_placeholder = st.empty()

    if st.button("OK"):
        modified_data = data.copy()

        if action == "Drop Duplicates":
            modified_data = modified_data.drop_duplicates()
        elif action == "Drop Duplicates in Column":
            modified_data = modified_data.drop_duplicates(subset=[column])
        elif action == "Keep First":
            # Keep the first occurrence within the column, drop the rest.
            modified_data = modified_data.drop_duplicates(
                subset=[column], keep="first")
        else:  # "Keep Last"
            # Keep the last occurrence within the column, drop the rest.
            modified_data = modified_data.drop_duplicates(
                subset=[column], keep="last")

        after_placeholder.write("### After:")
        after_placeholder.dataframe(modified_data)

        st.session_state["data"] = modified_data
144
+
145
+
146
def handle_outliers():
    """Remove or replace outliers in a numeric column using Tukey (IQR)
    fences or user-supplied bounds; shows before/after and persists the result.
    """
    st.title("Handle Outliers")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    data = st.session_state["data"].copy()

    column = st.selectbox(
        "Select Column", data.select_dtypes(include=[np.number]).columns)

    action = st.selectbox(
        "Select Action",
        ["Remove Outliers (IQR)", "Set Bounds Manually",
         "Replace Outliers"]
    )

    # Widgets that parameterize the action must exist on EVERY rerun, before
    # the OK button. The original created the bound inputs and the
    # replacement-method radio inside `if st.button("OK")`; such widgets
    # vanish on the next rerun, so their values could never be applied.
    manual_lower = manual_upper = None
    replace_method = None
    if action == "Set Bounds Manually":
        manual_lower = st.number_input(
            f"Set lower bound for {column}", value=float(data[column].min()))
        manual_upper = st.number_input(
            f"Set upper bound for {column}", value=float(data[column].max()))
    elif action == "Replace Outliers":
        replace_method = st.radio(
            "Select Replacement Method",
            ["Mean", "Median"]
        )

    st.write("### Before:")
    st.dataframe(data)

    after_placeholder = st.empty()

    if st.button("OK"):
        modified_data = data.copy()

        if action == "Remove Outliers (IQR)":
            lower_bound, upper_bound = _iqr_bounds(data[column])
            # Keep only the rows inside the fences.
            modified_data = modified_data[
                (modified_data[column] >= lower_bound) & (
                    modified_data[column] <= upper_bound)
            ]

        elif action == "Set Bounds Manually":
            # Keep only the rows inside the user-supplied bounds.
            modified_data = modified_data[
                (modified_data[column] >= manual_lower) & (
                    modified_data[column] <= manual_upper)
            ]

        elif action == "Replace Outliers":
            lower_bound, upper_bound = _iqr_bounds(data[column])
            if replace_method == "Mean":
                replacement_value = data[column].mean()
            else:
                replacement_value = data[column].median()

            # Replace values outside the fences with the chosen statistic.
            modified_data[column] = modified_data[column].apply(
                lambda x: replacement_value
                if x < lower_bound or x > upper_bound else x
            )

        after_placeholder.write("### After:")
        after_placeholder.dataframe(modified_data)

        st.session_state["data"] = modified_data


def _iqr_bounds(series):
    """Return the (lower, upper) Tukey fences Q1 - 1.5*IQR, Q3 + 1.5*IQR."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr
RAG.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+ from langchain.docstore.document import Document
4
+ from langchain.vectorstores import Chroma
5
+ from langchain.embeddings import HuggingFaceEmbeddings
6
+ from langchain.llms import HuggingFaceHub
7
+ from langchain.chains import RetrievalQA
8
+
9
+
10
+ # file_path = "thyroidDF.csv"
11
+ # df = pd.read_csv(file_path)
12
+
13
def create_doucment(df):
    """Convert each DataFrame row into a LangChain ``Document``.

    The row is serialized to a JSON string as the page content, and the
    row index is recorded in the metadata under ``"id"``.
    """
    documents = []
    for idx, row in df.iterrows():
        documents.append(
            Document(
                metadata={"id": str(idx)},
                # Serialize the row dict to a JSON string.
                page_content=json.dumps(row.to_dict()),
            )
        )
    return documents
23
+
24
+
25
def load_models_embedding():
    """Return a sentence-transformers MiniLM embedding model wrapper."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
29
+
30
+
31
def load_models_llm():
    """Build the HuggingFace Hub LLM client for Qwen2.5-72B-Instruct.

    Reads the API token from the ``HUGGINGFACEHUB_API_TOKEN`` environment
    variable. The original hard-coded a token literal in source (a leaked
    secret) and was also a syntax error: ``api="..."`` was passed as a call
    argument with no comma before the next keyword.
    """
    import os  # local import: only needed here to read the token

    llm = HuggingFaceHub(
        repo_id="Qwen/Qwen2.5-72B-Instruct",
        huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
        model_kwargs={"temperature": 0.5,
                      "max_length": 100}  # shorter outputs -> faster inference
    )
    return llm
41
+
42
+
43
def create_database(embedding, documents):
    """Build and return a Chroma vector store from *documents* using *embedding*."""
    return Chroma.from_documents(documents, embedding=embedding)
46
+
47
+ # retriever = create_database().as_retriever()
48
+
49
+
50
def ask_me(question, retriever, llm):
    """Answer *question* with a RetrievalQA ("stuff") chain over *retriever*.

    Uses the *llm* argument — the original ignored it and reloaded a model
    on every call via ``load_models_llm()`` — falling back to loading one
    only when ``llm`` is None. Prints the answer and also returns it so
    callers can use the result programmatically.
    """
    qa_chain = RetrievalQA.from_chain_type(
        retriever=retriever,
        chain_type="stuff",
        llm=llm if llm is not None else load_models_llm(),
        return_source_documents=True)

    response = qa_chain.invoke({"query": question})
    print("Answer:", response["result"])
    return response["result"]
60
+
61
+
62
+ # qa_chain = RetrievalQA.from_chain_type(
63
+ # retriever=retriever,
64
+ # chain_type="stuff",
65
+ # llm=llm,
66
+ # return_source_documents=True
67
+ # )
68
+
69
+ # question = "Can you provide the TSH, T3, and FTI values for patients aged 55?"
70
+ # # question = "What columns are in the dataset?"
71
+ # response = qa_chain.invoke({"query": question})
72
+ # print("Answer:", response["result"])