MahatirTusher committed on
Commit
4e675ce
·
verified ·
1 Parent(s): 593c364

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +951 -0
  2. requirements.txt +15 -0
app.py ADDED
@@ -0,0 +1,951 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from faker import Faker
4
+ import random
5
+ from groq import Groq
6
+ from io import BytesIO
7
+ import ast
8
+ import re
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+ import plotly.express as px
12
+ import io
13
+ from datetime import datetime
14
+ from sklearn.preprocessing import LabelEncoder
15
+ import matplotlib
16
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
17
+ from sklearn.model_selection import train_test_split, GridSearchCV
18
+ from sklearn.linear_model import LogisticRegression, LinearRegression
19
+ from sklearn.svm import SVC, SVR
20
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
21
+ from sklearn.preprocessing import StandardScaler
22
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
23
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
24
+ from sklearn.naive_bayes import GaussianNB
25
+ from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
26
+ from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
27
+ import numpy as np
28
+ from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
29
+ mean_absolute_error, mean_squared_error, r2_score,
30
+ silhouette_score, davies_bouldin_score, calinski_harabasz_score)
31
+ from PyPDF2 import PdfReader
32
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
33
+ from langchain.embeddings import HuggingFaceEmbeddings
34
+ from langchain.vectorstores import FAISS
35
+ from langchain.memory import ConversationBufferMemory
36
+ from langchain.chains import ConversationalRetrievalChain
37
+ from langchain_community.llms import Groq as LangChainGroq
38
+ import torch
39
+ import os
40
+
41
+ # Conditional import for time series models
42
# statsmodels is optional; HAS_STATSMODELS records whether both forecasting
# classes could be imported.
try:
    from statsmodels.tsa.holtwinters import ExponentialSmoothing
    from statsmodels.tsa.arima.model import ARIMA
except ImportError:
    HAS_STATSMODELS = False
else:
    HAS_STATSMODELS = True

# Select the non-interactive Agg backend for matplotlib.
matplotlib.use("Agg")

# Module-wide Faker instance used by the dataset generator.
fake = Faker()
54
+
55
# Global CSS injected once per page render by add_custom_styles().
_CUSTOM_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap');
html, body, [class*="css"] {
font-family: 'Roboto', sans-serif;
background-color: #f4f4f9;
}
.stButton>button {
background-color: #4CAF50;
color: white;
border: none;
padding: 10px 20px;
border-radius: 5px;
font-size: 16px;
}
.stButton>button:hover {
background-color: #45a049;
}
.header-banner {
text-align: center;
margin-bottom: 20px;
}
.header-banner img {
max-width: 150px;
margin-bottom: 10px;
}
.header-banner h1 {
font-size: 36px;
color: #333;
margin: 0;
}
.header-banner p {
font-size: 16px;
color: #666;
}
footer {
text-align: center;
margin-top: 50px;
padding: 10px;
font-size: 14px;
color: #888;
}
footer a {
color: #4CAF50;
text-decoration: none;
}
footer a:hover {
text-decoration: underline;
}
</style>
"""


def add_custom_styles():
    """Inject the application's stylesheet into the Streamlit page."""
    st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
110
+
111
def add_header():
    """Render the logo banner and the CSV upload widget.

    A successfully parsed upload is stored in
    ``st.session_state['uploaded_df']`` and its first rows are previewed.
    """
    banner_html = """
<div class="header-banner">
<img src="https://i.postimg.cc/5y20B10S/89c59ca6-c8a8-4210-ba7b-f77a44a8fa3a-removalai-preview.png" alt="DataGenie Logo" style="max-width: 280px;">
<p>Empowering your data journey with AI-driven insights and synthetic datasets</p>
</div>
"""
    st.markdown(banner_html, unsafe_allow_html=True)
    st.markdown("### Upload Your Dataset for Preprocessing, Training, and EDA")

    uploaded_file = st.file_uploader("Upload CSV", type="csv")
    if not uploaded_file:
        st.info("Upload a CSV file to get started.")
        return

    try:
        df = pd.read_csv(uploaded_file)
        st.success("Dataset uploaded successfully!")
        st.session_state['uploaded_df'] = df
        st.write("Preview of the uploaded dataset:")
        st.dataframe(df.head())
    except Exception as e:
        st.error(f"Error loading CSV file: {str(e)}")
134
+
135
def add_footer():
    """Render the attribution footer at the bottom of the page."""
    footer_html = """
<footer>
Developed by <a href="https://github.com/Mahatir-Ahmed-Tusher" target="_blank">Mahatir Ahmed Tusher</a>.
Inspired by the project "Predicta" by Ahmed Nafiz.
</footer>
"""
    st.markdown(footer_html, unsafe_allow_html=True)
145
+
146
def add_sidebar():
    """Populate the sidebar with the logo, the about text and the credits."""
    logo_url = "https://i.postimg.cc/5y20B10S/89c59ca6-c8a8-4210-ba7b-f77a44a8fa3a-removalai-preview.png"
    about_text = (
        "DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. Features: dataset generation, visualization, outlier detection, feature processing, ML model selection, and chat-based exploration."
    )
    sidebar = st.sidebar

    sidebar.image(logo_url, width=150, caption="DataGenie")
    sidebar.markdown("---")
    sidebar.title("About DataGenie")
    sidebar.info(about_text)
    sidebar.write("**Developed by:** Mahatir Ahmed Tusher")
    sidebar.write("**Inspired by:** Predicta by Ahmed Nafiz")
    sidebar.markdown("---")
    sidebar.write("**Your**")
    sidebar.image(logo_url, width=150)
165
+
166
+ # App configuration
167
# Display name of the application.
APP_NAME = "DataGenie"

# Build the shared Groq client from the GROQ_API_KEY environment variable;
# if construction fails, show the error and halt the script.
try:
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
except Exception as exc:
    st.error(f"Invalid Groq API key: {str(exc)}. Please set GROQ_API_KEY in environment variables.")
    st.stop()
175
+
176
+ # Utility functions
177
def extract_row_count(prompt):
    """Return the row count requested in a natural-language prompt.

    Looks for a number followed by "row(s)", "record(s)" or "entry/entries"
    (case-insensitive). Thousands separators such as "1,000" are accepted.

    Args:
        prompt: Free-text request, e.g. "Generate 1,000 rows of sales data".

    Returns:
        The requested count as an int, or 100 when no count is found.
    """
    match = re.search(
        # Try the comma-grouped form first so "1,000 rows" is not read as "000 rows".
        r'(\d{1,3}(?:,\d{3})+|\d+)\s*(?:rows?|records?|entries|entry)',
        prompt,
        re.IGNORECASE,
    )
    if not match:
        return 100
    return int(match.group(1).replace(",", ""))
180
+
181
# System prompt sent with every dataset-generation request.
_DATASET_SYSTEM_PROMPT = (
    "You are an expert Python code generator specializing in creating synthetic datasets using pandas, faker, and random. "
    "Based on the user's natural language prompt, generate a valid Python function named `create_dataset()` that returns a pandas DataFrame. "
    "Follow these strict rules:\n"
    "1. The function must start exactly with `def create_dataset():` and take no arguments.\n"
    "2. Use only `pd` (pandas), `fake` (Faker), and `random` (random module) within the function.\n"
    "3. Extract the number of rows from the prompt (e.g., '500 rows' or '1000 records') and use `range(<row_count>)` to generate exactly that many rows. If no row count is specified, default to 100 rows.\n"
    "4. Generate realistic data for all columns specified in the prompt, respecting any domain-specific details (e.g., age between 18-80, prices in USD, regional names).\n"
    "5. For target columns (e.g., 'yes/no', 'percentage', 'price', 'category'), use appropriate distributions or logic (e.g., random.choice(['Yes', 'No']), random.uniform(0, 100) for percentages).\n"
    "6. Ensure data types are correct: integers for counts, floats for percentages/prices, strings for names/emails, etc.\n"
    "7. The function must end with `return pd.DataFrame(data)` where `data` is a dictionary of column lists.\n"
    "8. Do not include comments, markdown, explanations, or extra text outside the function definition.\n"
    "Example for prompt 'Generate 200 rows of customer data with name, age, email, and purchase_amount':\n"
    "def create_dataset():\n"
    " data = {\n"
    " 'name': [fake.name() for _ in range(200)],\n"
    " 'age': [random.randint(18, 80) for _ in range(200)],\n"
    " 'email': [fake.email() for _ in range(200)],\n"
    " 'purchase_amount': [round(random.uniform(10.0, 500.0), 2) for _ in range(200)]\n"
    " }\n"
    " return pd.DataFrame(data)\n"
    "Handle edge cases gracefully, such as missing column details, by using reasonable defaults. "
    "Ensure the code is syntactically correct and executable. Remember, in case of classification yes means 1 and no means 0."
)


def _strip_code_fences(text):
    """Return the contents of the first markdown code fence, or the text itself."""
    fenced = re.search(r"```(?:python|py)?[ \t]*\n(.*?)```", text, re.DOTALL | re.IGNORECASE)
    return (fenced.group(1) if fenced else text).strip()


def generate_dataset_code(prompt):
    """Ask the Groq model for a ``create_dataset()`` function matching *prompt*.

    The reply is unwrapped from markdown code fences if the model added them,
    then checked to start with ``def create_dataset():`` and to parse as
    Python.

    Args:
        prompt: The user's natural-language dataset description.

    Returns:
        The generated source code as a string, or None when the API call
        fails or the reply is not a valid ``create_dataset`` definition (an
        error is shown in the UI in those cases).
    """
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": _DATASET_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            model="llama-3.3-70b-versatile",
        )
        # The content may be None; models also often wrap code in ``` fences
        # despite rule 8, which would fail the startswith check below.
        raw_reply = chat_completion.choices[0].message.content or ""
        code = _strip_code_fences(raw_reply.strip())
        if not code.startswith("def create_dataset():"):
            st.error("Generated code does not define create_dataset function correctly.")
            st.code(code, language="python")
            return None
        try:
            ast.parse(code)
            return code
        except SyntaxError as e:
            st.error(f"Invalid syntax in generated code: {str(e)}")
            st.code(code, language="python")
            return None
    except Exception as e:
        st.error(f"Error with Groq API: {str(e)}")
        return None
231
+
232
def execute_code(code):
    """Execute generated ``create_dataset`` source and return its DataFrame.

    The code runs with only ``pd``, ``fake``, ``random`` and a whitelist of
    harmless builtins in scope; ``__import__`` is not exposed, so ``import``
    statements inside the generated code fail.

    SECURITY NOTE: a restricted ``__builtins__`` dict is a guard rail, not a
    sandbox. The code comes from a language model and should be treated as
    untrusted.

    Args:
        code: Python source expected to define ``create_dataset()``.

    Returns:
        The DataFrame returned by ``create_dataset()``, or None when the code
        fails, defines no such function, or returns something else (an error
        is shown in the UI).
    """
    allowed_builtins = {
        "range": range, "list": list, "int": int, "str": str, "float": float,
        "round": round, "True": True, "False": False, "zip": zip,
        # Commonly needed by generated data code; none give file or import access.
        "len": len, "dict": dict, "tuple": tuple, "set": set, "bool": bool,
        "min": min, "max": max, "sum": sum, "abs": abs,
        "enumerate": enumerate, "sorted": sorted, "any": any, "all": all,
        "map": map, "filter": filter,
    }
    # One namespace for globals and locals so helper functions defined at the
    # top level of the generated code are visible from create_dataset().
    namespace = {
        "pd": pd,
        "fake": fake,
        "random": random,
        "__builtins__": allowed_builtins,
    }
    try:
        exec(code, namespace)
        create_dataset = namespace.get("create_dataset")
        if not callable(create_dataset):
            st.error("No create_dataset function defined.")
            return None
        df = create_dataset()
        if not isinstance(df, pd.DataFrame):
            st.error("Generated code did not return a pandas DataFrame.")
            return None
        return df
    except Exception as e:
        st.error(f"Execution error: {str(e)}")
        return None
257
+
258
def to_csv_bytes(df):
    """Serialize *df* to CSV (without the index) in an in-memory byte buffer.

    Returns:
        A ``BytesIO`` positioned at the start of the UTF-8 encoded CSV data.
    """
    csv_text = df.to_csv(index=False)
    return BytesIO(csv_text.encode("utf-8"))
263
+
264
+ # Visualization functions
265
def visualize_dataset(df):
    """Render one user-selected chart for *df*, with a sidebar download button.

    The chart type and columns are chosen through sidebar select boxes.
    "Time Series" is offered only when the frame has both a datetime and a
    numeric column.

    Args:
        df: The DataFrame to visualize. It is not modified.
    """
    st.subheader("Dataset Visualizations")
    # Check the type first: evaluating `.empty` on a non-DataFrame (e.g. None)
    # would raise before the warning could be shown.
    if not isinstance(df, pd.DataFrame) or df.empty:
        st.warning("No valid data to visualize.")
        return

    # "number" covers every numeric width (int32, float32, nullable ints, ...),
    # not just int64/float64.
    numerical_cols = df.select_dtypes(include='number').columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
    if not (numerical_cols or categorical_cols or datetime_cols):
        st.warning("No columns available to visualize.")
        return

    viz_options = ["Histogram", "Box Plot", "Scatter Plot", "Count Plot", "Correlation Heatmap"]
    if datetime_cols and numerical_cols:
        viz_options.append("Time Series")
    viz_type = st.sidebar.selectbox("Select Visualization Type", viz_options)
    plt.clf()

    try:
        if viz_type == "Histogram" and numerical_cols:
            col = st.sidebar.selectbox("Select Numerical Column", numerical_cols)
            fig, ax = plt.subplots()
            sns.histplot(data=df, x=col, kde=True, bins='auto', ax=ax)
            st.pyplot(fig)
            download_image(fig, f"histogram_{col}")
            plt.close(fig)

        elif viz_type == "Box Plot" and numerical_cols:
            col = st.sidebar.selectbox("Select Numerical Column", numerical_cols)
            fig, ax = plt.subplots()
            sns.boxplot(data=df, y=col, ax=ax)
            st.pyplot(fig)
            download_image(fig, f"boxplot_{col}")
            plt.close(fig)

        elif viz_type == "Scatter Plot" and len(numerical_cols) >= 2:
            x_col = st.sidebar.selectbox("Select X-axis Column", numerical_cols)
            y_col = st.sidebar.selectbox("Select Y-axis Column", [c for c in numerical_cols if c != x_col])
            fig = px.scatter(df, x=x_col, y=y_col)
            st.plotly_chart(fig)
            img_bytes = io.BytesIO()
            fig.write_image(img_bytes, format='png')
            st.sidebar.download_button("Download Scatter Plot", img_bytes.getvalue(),
                                       file_name=f"scatter_{x_col}_{y_col}.png",
                                       key=f"scatter_{x_col}_{y_col}_{datetime.now().strftime('%H%M%S')}")

        elif viz_type == "Count Plot" and categorical_cols:
            col = st.sidebar.selectbox("Select Categorical Column", categorical_cols)
            fig, ax = plt.subplots()
            sns.countplot(data=df, x=col, ax=ax)
            plt.xticks(rotation=45, ha='right')
            st.pyplot(fig)
            download_image(fig, f"countplot_{col}")
            plt.close(fig)

        elif viz_type == "Correlation Heatmap" and numerical_cols:
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt='.2f', ax=ax)
            st.pyplot(fig)
            download_image(fig, "correlation_heatmap")
            plt.close(fig)

        elif viz_type == "Time Series" and datetime_cols and numerical_cols:
            datetime_col = st.sidebar.selectbox("Select Datetime Column", datetime_cols)
            value_col = st.sidebar.selectbox("Select Value Column", numerical_cols)
            # The column was selected by datetime64 dtype above, so no
            # conversion (and no write into the caller's frame) is needed.
            fig = px.line(df, x=datetime_col, y=value_col)
            st.plotly_chart(fig)
            img_bytes = io.BytesIO()
            fig.write_image(img_bytes, format='png')
            st.sidebar.download_button("Download Time Series", img_bytes.getvalue(),
                                       file_name=f"time_series_{datetime_col}_{value_col}.png",
                                       key=f"timeseries_{datetime_col}_{value_col}_{datetime.now().strftime('%H%M%S')}")

        else:
            # The chosen chart needs a column kind this dataset lacks; say so
            # instead of rendering nothing.
            st.info(f"The dataset has no suitable columns for a {viz_type}.")
    except Exception as e:
        st.error(f"Visualization error: {str(e)}")
341
+
342
def visualize_specific_features(df, features):
    """Plot each requested feature with a chart suited to its dtype.

    Numeric columns get a histogram with KDE; categorical and string columns
    get a count plot. Datetime columns and other dtypes are skipped with a
    warning.

    Args:
        df: Source DataFrame.
        features: Iterable of column names to plot.
    """
    st.subheader("Feature-Specific Visualizations")
    for feature in features:
        if feature not in df.columns:
            st.warning(f"Feature '{feature}' not found.")
            continue

        column = df[feature]
        is_numeric = pd.api.types.is_numeric_dtype(column)
        # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
        is_discrete = (
            isinstance(column.dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(column)
        )
        if not (is_numeric or is_discrete):
            # Decide before creating a figure so no empty plot is rendered.
            if pd.api.types.is_datetime64_any_dtype(column):
                st.warning(f"Use 'Time Series' in main visualization for '{feature}'.")
            else:
                st.warning(f"Feature '{feature}' has an unsupported type ({column.dtype}).")
            continue

        fig, ax = plt.subplots()
        try:
            if is_numeric:
                sns.histplot(data=df, x=feature, kde=True, bins='auto', ax=ax)
            else:
                sns.countplot(data=df, x=feature, ax=ax)
                plt.xticks(rotation=45, ha='right')
            st.pyplot(fig)
            download_image(fig, f"feature_{feature}")
        except Exception as e:
            st.error(f"Error visualizing '{feature}': {str(e)}")
        finally:
            # Always release the figure, on success and on failure.
            plt.close(fig)
365
+
366
def download_image(fig, key_prefix):
    """Add a sidebar button that downloads *fig* as a PNG.

    Args:
        fig: Matplotlib figure to export.
        key_prefix: Prefix for both the file name and the widget key.
    """
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format='png', bbox_inches='tight')
    png_buffer.seek(0)

    download_name = f"{key_prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
    widget_key = f"download_{key_prefix}_{datetime.now().strftime('%H%M%S')}"
    st.sidebar.download_button(
        label="Download Image",
        data=png_buffer,
        file_name=download_name,
        mime="image/png",
        key=widget_key,
    )
374
+
375
+ # Data processing functions
376
def dataset_overview(df):
    """Show the shape, column dtypes and first rows of *df*."""
    row_total = len(df)
    column_total = len(df.columns)
    st.subheader("Dataset Overview")
    st.write(f"Rows: {row_total}, Columns: {column_total}")
    st.write("Data Types:", df.dtypes)
    st.write(df.head())
381
+
382
def clean_data(df):
    """Return a copy of *df* without rows containing NaN and without duplicates.

    The first rows of the cleaned frame are also displayed.
    """
    st.subheader("Clean Data")
    without_missing = df.dropna()
    cleaned_df = without_missing.drop_duplicates()
    st.write("Cleaned Dataset:", cleaned_df.head())
    return cleaned_df
387
+
388
def detect_outlier(df):
    """Display, per int64/float64 column, the rows outside the 1.5*IQR fences."""
    st.subheader("Detect Outliers")
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        values = df[col]
        q1, q3 = values.quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        outliers = df[(values < lower_fence) | (values > upper_fence)]
        if outliers.empty:
            continue
        st.write(f"Outliers in {col}:", outliers)
397
+
398
def encoder(df):
    """Return a copy of *df* with object/category columns label-encoded.

    The first rows of the encoded frame are also displayed.
    """
    st.subheader("Encode Data")
    encoded_df = df.copy()
    label_encoder = LabelEncoder()
    text_like_columns = df.select_dtypes(include=['object', 'category']).columns
    for name in text_like_columns:
        # fit_transform refits the encoder for each column.
        encoded_df[name] = label_encoder.fit_transform(df[name])
    st.write("Encoded Dataset:", encoded_df.head())
    return encoded_df
406
+
407
def data_transformer(df):
    """Return an unmodified copy of *df* and preview it.

    No transformation is applied yet; this is a placeholder for future ones.
    """
    st.subheader("Data Transformer")
    transformed_df = df.copy()
    preview = transformed_df.head()
    st.write("Transformed Dataset:", preview)
    return transformed_df
412
+
413
def data_analysis(df):
    """Display the summary statistics produced by ``df.describe()``."""
    st.subheader("Data Analysis")
    summary = df.describe()
    st.write(summary)
416
+
417
def feature_importance_analyzer(df):
    """Fit a random forest on a chosen target and chart feature importances.

    All other columns are used as features (one-hot encoded). A non-numeric
    target is label-encoded and treated as classification; a numeric target
    is treated as classification when it has at most 10 distinct values and
    as regression otherwise.

    Args:
        df: DataFrame holding both the features and the target column.
    """
    st.subheader("Feature Importance Analyzer")
    target_column = st.selectbox("Select Target Column", df.columns)
    feature_columns = [col for col in df.columns if col != target_column]
    if not feature_columns:
        st.warning("No features available.")
        return

    X = pd.get_dummies(df[feature_columns], drop_first=True)
    y = df[target_column]
    needs_encoding = not pd.api.types.is_numeric_dtype(y)
    if needs_encoding:
        # Keep y a Series: LabelEncoder returns an ndarray, which has no
        # .nunique() and made the task-type check below fail.
        y = pd.Series(LabelEncoder().fit_transform(y), index=y.index, name=target_column)

    try:
        is_classification = needs_encoding or y.nunique() <= 10
        model = (
            RandomForestClassifier(random_state=42)
            if is_classification
            else RandomForestRegressor(random_state=42)
        )
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        importance_df = pd.DataFrame(
            {"Feature": X.columns, "Importance": model.feature_importances_}
        ).sort_values(by="Importance", ascending=False)
        st.write("Feature Importances:", importance_df)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis", ax=ax)
        st.pyplot(fig)
        download_image(fig, "feature_importance")
        plt.close(fig)
    except Exception as e:
        st.error(f"Error analyzing features: {str(e)}")
443
+
444
def _parse_param_value(token):
    """Convert one text token of a parameter grid to None, int, float or str."""
    token = token.strip()
    if token.lower() == "none":
        return None
    for cast in (int, float):
        try:
            return cast(token)
        except ValueError:
            continue
    # Non-numeric values such as kernel names stay strings.
    return token


def best_parameter_selector(df):
    """Grid-search hyperparameters for a user-selected model and target.

    The user picks the task type, target column, model and scoring metric,
    and may edit each parameter's candidate values as comma-separated text.
    Models with an empty grid are fitted once with default parameters.

    Args:
        df: DataFrame holding both the features and the target column.
    """
    st.subheader("Best Parameter Selector")
    task_type = st.selectbox("Select Task Type", ["Classification", "Regression"])
    target_column = st.selectbox("Select Target Column", df.columns)
    feature_columns = [col for col in df.columns if col != target_column]
    if not feature_columns:
        st.warning("No features available.")
        return

    X = pd.get_dummies(df[feature_columns], drop_first=True)
    y = df[target_column]
    if task_type == "Classification" and y.dtype in ['object', 'category']:
        y = LabelEncoder().fit_transform(y)

    model_options = {
        "Classification": {
            "Logistic Regression": (LogisticRegression, {"C": [0.01, 0.1, 1], "max_iter": [100, 200]}),
            "Random Forest": (RandomForestClassifier, {"n_estimators": [50, 100], "max_depth": [None, 10]}),
            "SVM": (SVC, {"C": [0.1, 1], "kernel": ["rbf", "linear"]})
        },
        "Regression": {
            "Linear Regression": (LinearRegression, {}),
            "Random Forest": (RandomForestRegressor, {"n_estimators": [50, 100], "max_depth": [None, 10]}),
            "SVR": (SVR, {"C": [0.1, 1], "epsilon": [0.1, 0.2]})
        }
    }
    model_name = st.selectbox("Select Model", list(model_options[task_type].keys()))
    model_class, default_grid = model_options[task_type][model_name]
    # get_params() is the estimator API for listing constructor parameters and
    # does not depend on how __init__ is implemented or decorated.
    if "random_state" in model_class().get_params():
        model = model_class(random_state=42)
    else:
        model = model_class()

    param_grid = {}
    for param, values in default_grid.items():
        new_values = st.text_input(
            f"Values for {param} (comma-separated)",
            ",".join(map(str, values)) if values else "",
            # Key per model so e.g. "C" of Logistic Regression and SVM do not share state.
            key=f"param_grid_{task_type}_{model_name}_{param}",
        )
        if new_values:
            # Handles "None" and string values (e.g. "rbf"), which previously
            # raised ValueError from int() outside any try block.
            parsed = [_parse_param_value(tok) for tok in new_values.split(',') if tok.strip()]
            param_grid[param] = parsed or values
        else:
            param_grid[param] = values

    scoring = st.selectbox("Select Scoring Metric", ["accuracy", "f1"] if task_type == "Classification" else ["r2", "neg_mean_squared_error"])
    try:
        if param_grid:
            with st.spinner("Performing GridSearchCV..."):
                grid_search = GridSearchCV(model, param_grid, cv=3, scoring=scoring, n_jobs=-1)
                grid_search.fit(X, y)
                st.write("Best Parameters:", grid_search.best_params_)
                st.write("Best Score:", grid_search.best_score_)
        else:
            model.fit(X, y)
            st.write("Model trained with default parameters. Score:", model.score(X, y))
    except Exception as e:
        st.error(f"Parameter selection error: {str(e)}")
492
+
493
def _supervised_metrics(analysis_type, y_test, y_pred):
    """Return the metrics for the given task type only.

    Classification metrics raise on continuous predictions, so the two metric
    sets must never be computed together.
    """
    if analysis_type == "Classification":
        return {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
            "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
            "F1 Score": f1_score(y_test, y_pred, average='weighted', zero_division=0)
        }
    mse = mean_squared_error(y_test, y_pred)
    return {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R²": r2_score(y_test, y_pred)
    }


def select_ml_models(df):
    """Train and evaluate a user-selected model on *df*.

    Supports four analysis types chosen in the UI:
      * Classification / Regression: 80/20 split, fit, and a metric table.
      * Clustering: scaled features, cluster labels, and (except for DBSCAN)
        internal validity scores.
      * Time Series: chronological 80/20 split and forecast error metrics;
        requires statsmodels and a datetime column.

    Args:
        df: DataFrame with the data to model. It is not modified.
    """
    st.subheader("Select ML Models")
    analysis_type = st.selectbox("Select Analysis Type", ["Classification", "Regression", "Clustering", "Time Series"])

    if analysis_type in ["Classification", "Regression"]:
        target_col = st.selectbox("Select Target Variable", df.columns)
        feature_cols = st.multiselect("Select Feature Columns", [col for col in df.columns if col != target_col])
        if not feature_cols:
            st.warning("Select at least one feature.")
            return

        X = pd.get_dummies(df[feature_cols])
        y = df[target_col]

        if analysis_type == "Classification":
            if pd.api.types.is_float_dtype(y) or (pd.api.types.is_numeric_dtype(y) and y.nunique() > len(y) // 10):
                st.error(
                    f"Target column '{target_col}' appears to be continuous (float or many unique values: {y.nunique()}). "
                    "Classification requires discrete labels (e.g., 'Yes/No', integers with few unique values). "
                    "Please select a categorical target, bin this column, or choose 'Regression' for continuous targets."
                )
                return
            if y.dtype in ['object', 'category'] or pd.api.types.is_string_dtype(y):
                y = LabelEncoder().fit_transform(y)
        elif analysis_type == "Regression":
            if not pd.api.types.is_numeric_dtype(y):
                st.error(
                    f"Target column '{target_col}' is not numeric (type: {y.dtype}). "
                    "Regression requires a numeric target (e.g., float or integer). "
                    "Please select a numeric target or preprocess the data."
                )
                return

        model_options = {
            "Classification": {
                "Logistic Regression": LogisticRegression(random_state=42),
                "Random Forest": RandomForestClassifier(random_state=42),
                "SVM": SVC(random_state=42),
                "KNN": KNeighborsClassifier()
            },
            "Regression": {
                "Linear Regression": LinearRegression(),
                "Random Forest": RandomForestRegressor(random_state=42),
                "SVR": SVR(),
                "Decision Tree": DecisionTreeRegressor(random_state=42)
            }
        }[analysis_type]

        selected_model = st.selectbox("Select Model", list(model_options.keys()))
        if st.button("Train Model"):
            with st.spinner("Training model..."):
                try:
                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                    model = model_options[selected_model]
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    # Compute only the metrics of the active task: building
                    # both sets made accuracy_score raise on regression output.
                    metrics = _supervised_metrics(analysis_type, y_test, y_pred)
                    st.write("Model Performance:", pd.DataFrame(metrics.items(), columns=["Metric", "Value"]))
                except Exception as e:
                    st.error(f"Training error: {str(e)}")

    elif analysis_type == "Clustering":
        feature_cols = st.multiselect("Select Features for Clustering", df.columns)
        if not feature_cols:
            st.warning("Select at least one feature.")
            return

        X = pd.get_dummies(df[feature_cols])
        n_clusters = st.slider("Number of Clusters", 2, 10, 3)
        clustering_models = {
            "K-Means": KMeans(n_clusters=n_clusters, random_state=42),
            "DBSCAN": DBSCAN(eps=0.5, min_samples=5),
            "Agglomerative": AgglomerativeClustering(n_clusters=n_clusters)
        }
        selected_model = st.selectbox("Select Clustering Algorithm", list(clustering_models.keys()))
        if st.button("Perform Clustering"):
            with st.spinner("Performing clustering..."):
                # Same error handling as the supervised branch: missing values
                # or a degenerate clustering otherwise crash the page.
                try:
                    X_scaled = StandardScaler().fit_transform(X)
                    model = clustering_models[selected_model]
                    clusters = model.fit_predict(X_scaled)
                    df_clusters = df.copy()
                    df_clusters['Cluster'] = clusters
                    st.write("Clustered Data Sample:", df_clusters.head())
                    if selected_model != "DBSCAN":
                        metrics = {
                            "Silhouette": silhouette_score(X_scaled, clusters),
                            "Davies-Bouldin": davies_bouldin_score(X_scaled, clusters),
                            "Calinski-Harabasz": calinski_harabasz_score(X_scaled, clusters)
                        }
                        st.write("Clustering Metrics:", pd.DataFrame(metrics.items(), columns=["Metric", "Value"]))
                except Exception as e:
                    st.error(f"Clustering error: {str(e)}")

    elif analysis_type == "Time Series":
        if not HAS_STATSMODELS:
            st.error("Install statsmodels: `pip install statsmodels`")
            return
        datetime_cols = df.select_dtypes(include=['datetime64']).columns
        if datetime_cols.empty:
            # Previously this case rendered nothing at all.
            st.warning("No datetime columns found for time series analysis.")
            return

        date_col = st.selectbox("Select Date Column", datetime_cols)
        value_col = st.selectbox("Select Value Column", df.select_dtypes(include=['float64', 'int64']).columns)
        selected_model = st.selectbox("Select Forecasting Model", ["Exponential Smoothing", "ARIMA"])
        if st.button("Analyze Time Series"):
            with st.spinner("Analyzing time series..."):
                try:
                    ts_df = df.sort_values(date_col)
                    train_size = int(len(ts_df) * 0.8)
                    train, test = ts_df[:train_size], ts_df[train_size:]
                    if selected_model == "Exponential Smoothing":
                        model = ExponentialSmoothing(train[value_col], trend='add', seasonal='add', seasonal_periods=12).fit()
                    else:
                        model = ARIMA(train[value_col], order=(1, 1, 1)).fit()
                    forecast = model.forecast(steps=len(test))
                    mse = mean_squared_error(test[value_col], forecast)
                    metrics = {
                        "MAE": mean_absolute_error(test[value_col], forecast),
                        "MSE": mse,
                        "RMSE": np.sqrt(mse),
                        "MAPE": np.mean(np.abs((test[value_col] - forecast) / test[value_col])) * 100
                    }
                    st.write("Forecasting Metrics:", pd.DataFrame(metrics.items(), columns=["Metric", "Value"]))
                except Exception as e:
                    st.error(f"Time series error: {str(e)}")
624
+
625
def clear_modified_dataset():
    """Remove the uploaded dataset from the session state, if one is stored."""
    st.subheader("Clear Modified Dataset")
    if 'uploaded_df' in st.session_state:
        del st.session_state['uploaded_df']
    st.write("Dataset cleared.")
629
+
630
def _run_generated_snippet(response, df):
    """Execute a model-generated snippet against *df* and show its output."""
    try:
        # SECURITY NOTE(review): this runs model-generated text with full
        # builtins. It should only ever run at the user's explicit request,
        # and ideally inside a real sandbox.
        safe_globals = {"pd": pd, "plt": plt, "sns": sns, "df": df, "io": io, "np": np}
        safe_locals = {}
        exec(response, safe_globals, safe_locals)

        # Check for matplotlib or seaborn plots
        if "plt." in response or "sns." in response:
            st.pyplot(plt.gcf())
            plt.clf()

        # Check for DataFrame outputs
        elif "pd.DataFrame" in response or "df" in response:
            output_df = safe_locals.get("df", None)
            if isinstance(output_df, pd.DataFrame):
                st.write("Generated DataFrame:")
                st.dataframe(output_df)
            else:
                st.write("Code executed successfully. Check the output above if applicable.")
        else:
            st.write("Code executed successfully. Check the output above if applicable.")
    except Exception as e:
        st.error(f"Error executing code: {str(e)}")


def chat_with_dataset(df):
    """Let the user ask the Groq model questions about *df*.

    The first five rows of the frame are sent along with the question. The
    latest reply is kept in ``st.session_state['chat_response']`` so that it,
    and the "Execute Generated Code" button, survive the rerun that Streamlit
    performs on every button click.

    Args:
        df: The dataset under discussion; exposed to executed code as ``df``.
    """
    st.subheader("Chat with Your Dataset")
    st.write("Ask questions about your dataset. For example, 'What is the average value of column X?' or 'Show me the top 5 rows.'")

    user_query = st.text_area("Enter your query:", height=100)
    if st.button("Ask"):
        if not user_query.strip():
            st.warning("Please enter a query.")
            return

        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "You are an expert data analyst. Answer the user's questions about the provided pandas DataFrame. "
                            "Use Python pandas to analyze the data and provide concise answers. "
                            "If the user asks for code, generate Python code snippets using pandas to perform the requested operation. "
                            "Do not include explanations unless explicitly requested."
                        ),
                    },
                    {"role": "user", "content": f"The dataset is:\n{df.head(5).to_string()}\n\n{user_query}"},
                ],
                model="llama-3.3-70b-versatile",
            )
            st.session_state["chat_response"] = chat_completion.choices[0].message.content.strip()
        except Exception as e:
            st.error(f"Error with Groq API: {str(e)}")
            return

    # Render the reply outside the "Ask" branch: a button is True only for the
    # run triggered by its own click, so a button nested under "Ask" could
    # never be observed as pressed.
    response = st.session_state.get("chat_response")
    if not response:
        return

    st.write("Response:")
    st.code(response, language="python" if "def " in response or "import " in response else None)

    st.write("You can execute the generated code below:")
    if st.button("Execute Generated Code"):
        _run_generated_snippet(response, df)
686
+
687
def process_paper_with_rag(uploaded_paper):
    """Build a retrieval-backed conversation chain over an uploaded PDF.

    Steps: extract the text of every page, split it into overlapping chunks,
    embed the chunks into a FAISS vector store, and wrap a Groq LLM plus
    conversation memory in a ``ConversationalRetrievalChain``.

    Args:
        uploaded_paper: File-like PDF object accepted by ``PdfReader``.

    Returns:
        ``(text, chunks, conversation_chain)`` on success, or
        ``(None, None, None)`` when processing fails or the PDF contains no
        extractable text (an error is shown in the UI).
    """
    try:
        # Extract text from PDF; join once instead of repeated string +=.
        pdf_reader = PdfReader(uploaded_paper)
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        if not text.strip():
            # Without text there are no chunks to index, and the vector store
            # would fail with an unhelpful error.
            st.error("Error processing paper: no extractable text found in the PDF.")
            return None, None, None

        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)

        # Create embeddings (no HF token required)
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
        )

        # Create vector store
        vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)

        # Initialize Groq LLM for LangChain
        llm = LangChainGroq(
            model_name="llama-3.3-70b-versatile",
            groq_api_key=os.getenv("GROQ_API_KEY"),
            temperature=0.5,
            max_tokens=512
        )

        # Create conversation chain
        memory = ConversationBufferMemory(
            memory_key='chat_history',
            return_messages=True
        )

        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            memory=memory
        )

        return text, chunks, conversation_chain

    except Exception as e:
        st.error(f"Error processing paper: {str(e)}")
        return None, None, None
737
+
738
def analyze_research_paper():
    """Render the "Analyze Research Paper" tab.

    Lets the user upload a PDF, processes it with ``process_paper_with_rag``,
    offers to generate a Python implementation of the paper, and accepts
    follow-up questions about it.

    Streamlit reruns the whole script on every widget interaction and a
    button only reports True on the rerun triggered by its own click.
    The processed paper and the generated code are therefore kept in
    ``st.session_state`` so that they survive the rerun caused by the
    download button or the "Ask" button, and so that the PDF is not
    re-embedded (and the conversation memory reset) on every interaction.
    """
    st.header("Analyze Research Paper")
    st.write("Upload a research paper (PDF format) to analyze and generate possible code implementations based on the paper's content.")

    # Add installation instructions
    with st.expander("Setup Instructions"):
        st.write("Before using this feature, please install the required packages:")
        st.code(
            "pip install PyPDF2 langchain langchain-community faiss-cpu sentence-transformers torch",
            language="bash",
        )

    uploaded_paper = st.file_uploader("Upload Research Paper (PDF)", type="pdf")
    if not uploaded_paper:
        st.info("Upload a research paper to get started.")
        return

    try:
        # Identify the upload so a different file invalidates the cache.
        paper_key = (uploaded_paper.name, uploaded_paper.size)
        if st.session_state.get("paper_rag_key") != paper_key:
            text, chunks, conversation_chain = process_paper_with_rag(uploaded_paper)
            if not (text and chunks and conversation_chain):
                # process_paper_with_rag has already reported the error.
                return
            st.session_state["paper_rag"] = (text, chunks, conversation_chain)
            st.session_state["paper_generated_code"] = None
            st.session_state["paper_rag_key"] = paper_key

        text, chunks, conversation_chain = st.session_state["paper_rag"]
        st.success("Research paper processed successfully!")

        # Show paper chunks
        with st.expander("View Paper Chunks"):
            for i, chunk in enumerate(chunks):
                st.write(f"Chunk {i+1}:")
                st.text(chunk)

        if st.button("Generate The Possible Code of the Paper"):
            with st.spinner("Analyzing paper and generating code..."):
                # Use conversation chain to generate code
                response = conversation_chain({"question": "Based on this research paper, generate a detailed Python implementation of the main algorithms and methods described. Include all necessary imports and ensure the code is well-structured."})

            st.session_state["paper_generated_code"] = response['answer']

            # Store conversation in session state
            history = st.session_state.setdefault("chat_history", [])
            history.append(("user", "Generate code implementation"))
            history.append(("assistant", response['answer']))

        generated_code = st.session_state.get("paper_generated_code")
        if generated_code is None:
            # Nothing generated yet for this paper; no follow-up section.
            return

        st.subheader("Generated Code")
        st.code(generated_code, language="python")

        # Allow users to download the generated code
        st.download_button(
            label="Download Code as TXT",
            data=generated_code.encode(),
            file_name="generated_code.txt",
            mime="text/plain",
        )

        # Follow-up questions live outside the "Generate" branch: nested
        # inside it they would never run, because the outer button is
        # False on the rerun triggered by clicking "Ask".
        st.subheader("Ask Questions About the Implementation")
        user_question = st.text_input("Enter your question about the paper or implementation:")
        # Explicit key avoids clashing with any other "Ask" button in the app.
        if st.button("Ask", key="paper_followup_ask") and user_question:
            with st.spinner("Generating response..."):
                response = conversation_chain({"question": user_question})
            st.write("Response:", response['answer'])
            history = st.session_state.setdefault("chat_history", [])
            history.append(("user", user_question))
            history.append(("assistant", response['answer']))

    except Exception as e:
        st.error(f"Error processing the research paper: {str(e)}")
        st.write("Please make sure you have installed all required packages:")
        st.code("pip install PyPDF2 langchain langchain-community faiss-cpu sentence-transformers torch")
807
+
808
# Main app layout: global styling, header, and the four top-level tabs.
add_custom_styles()
st.title("")
add_header()

tab1, tab2, tab3, tab4 = st.tabs(
    ["Dataset Generator", "Example Prompts", "Chat with Dataset", "Analyze Research Paper"]
)

with tab1:
    # Tab 1: turn a natural-language prompt into dataset-generating code,
    # then run that code on request and offer the result as a CSV download.
    st.header("Generate Synthetic Datasets")
    st.write("Enter a prompt to generate a synthetic dataset. Be as descriptive as possible (e.g., 'Generate 500 rows for heart risk prediction with age, common symptoms like chest pain and shortness of breath, and a risk level (yes/no)'). For more examples, check the 'Example Prompts' tab.")
    prompt = st.text_area("Your prompt:", height=100)

    # Seed both session slots the first time the app runs in this session.
    if "generated_code" not in st.session_state:
        st.session_state["generated_code"] = None
        st.session_state["expected_rows"] = None

    generate_clicked = st.button("Generate Code")
    if generate_clicked and not prompt:
        st.warning("Enter a prompt.")
    elif generate_clicked:
        new_code = generate_dataset_code(prompt)
        if not new_code:
            st.error("Generated code does not define create_dataset function correctly.")
        else:
            # Keep the code across reruns so "Get the Dataset" can use it.
            st.session_state["generated_code"] = new_code
            st.session_state["expected_rows"] = extract_row_count(prompt)
            st.subheader("Generated Python Code")
            st.code(new_code, language="python")
            st.info("Review the code and click 'Get the Dataset'.")

    # The second button is only offered once there is code to execute.
    if st.session_state["generated_code"] and st.button("Get the Dataset"):
        df = execute_code(st.session_state["generated_code"])
        if df is not None:
            row_count = len(df)
            requested_rows = st.session_state["expected_rows"]
            if row_count != requested_rows:
                st.warning(f"Dataset has {row_count} rows; requested {requested_rows}.")
            st.subheader("Generated Dataset")
            column_list = ', '.join(df.columns)
            st.write(f"Rows: {row_count}, Columns: {column_list}")
            st.dataframe(df.head())
            st.download_button(
                label="Download CSV",
                data=to_csv_bytes(df),
                file_name="datagenie_dataset.csv",
                mime="text/csv",
            )
848
+
849
with tab2:
    # Tab 2: a static catalogue of example prompts, grouped by domain.
    st.header("Example Prompts")
    st.write("Explore example prompts to generate synthetic datasets for various domains.")

    # Section title -> example prompts, rendered in insertion order.
    # The emoji prefixes were mis-encoded (UTF-8 read in the wrong code page)
    # and are restored here.
    # NOTE(review): the Environment globe was reconstructed from a truncated
    # byte sequence; confirm which globe variant was intended.
    example_prompts = {
        "💼 Finance & Business": [
            "Generate 1000 customer records for a bank with age, income, loan amount, credit score, and defaulted (Yes/No).",
            "Create 500 rows of sales data with product category, region, sales amount, profit margin, and sales channel (Online/Offline).",
            "Generate 200 rows of stock market data with date, opening price, closing price, highest price, lowest price, and trading volume.",
        ],
        "🧑‍🎓 Education": [
            "Create 700 student records with study hours, attendance, and final grade (A, B, C, D, F).",
            "Generate 300 rows of teacher performance data with years of experience, subject taught, average student score, and teacher rating (1-5).",
            "Generate 1000 rows of university admission data with applicant age, GPA, SAT score, extracurricular activities, and admission status (Accepted/Rejected).",
        ],
        "🌍 Environment": [
            "Generate 365 days of air quality data with PM2.5, PM10, CO2, and air quality (Good, Moderate, Hazardous).",
            "Create 500 rows of weather data with date, temperature, humidity, wind speed, and precipitation level.",
            "Generate 1000 rows of energy consumption data with household size, monthly usage (kWh), energy source (Solar, Wind, Grid), and cost.",
        ],
        "🏥 Healthcare": [
            "Generate 1000 patient records with age, gender, blood pressure, cholesterol level, and diagnosis (Healthy, At Risk, Critical).",
            "Create 500 rows of hospital data with department, number of patients, average treatment cost, and satisfaction rating (1-5).",
            "Generate 300 rows of clinical trial data with participant ID, age, treatment type, side effects (Yes/No), and outcome (Improved/Unchanged/Worsened).",
        ],
        "🚗 Transportation": [
            "Generate 1000 rows of vehicle data with make, model, year, fuel efficiency (mpg), and price.",
            "Create 500 rows of traffic data with date, time, location, number of vehicles, and average speed.",
            "Generate 300 rows of ride-sharing data with driver ID, trip distance, trip duration, fare amount, and rating (1-5).",
        ],
        "🛒 Retail & E-commerce": [
            "Generate 1000 rows of customer purchase data with customer ID, product category, purchase amount, and payment method (Credit Card, PayPal, Cash).",
            "Create 500 rows of inventory data with product ID, category, stock level, reorder point, and supplier.",
            "Generate 300 rows of website analytics data with date, page views, unique visitors, bounce rate, and conversion rate.",
        ],
        "🏗️ Construction & Real Estate": [
            "Generate 500 rows of real estate data with property type, location, size (sq ft), price, and status (Available/Sold).",
            "Create 300 rows of construction project data with project ID, start date, end date, budget, and completion status (On Track/Delayed).",
            "Generate 200 rows of rental data with property type, monthly rent, tenant age, and lease duration (months).",
        ],
        "🎮 Gaming & Entertainment": [
            "Generate 1000 rows of gaming data with player ID, game title, hours played, in-game purchases, and player rank.",
            "Create 500 rows of movie data with title, genre, release year, box office revenue, and IMDb rating.",
            "Generate 300 rows of music streaming data with user ID, song title, artist, play count, and duration (minutes).",
        ],
    }

    for section_title, section_prompts in example_prompts.items():
        st.subheader(section_title)
        for example in section_prompts:
            st.write(example)
891
+
892
with tab3:
    # Tab 3: upload a CSV and hand the parsed frame to the chat helper.
    st.header("Chat with Dataset")
    uploaded_file = st.file_uploader("Upload CSV for Chatting", type="csv")
    if not uploaded_file:
        st.info("Upload a CSV file to start chatting.")
    else:
        try:
            df = pd.read_csv(uploaded_file)
            st.success("File uploaded successfully!")
            chat_with_dataset(df)
        except Exception as e:
            # Covers both CSV parsing failures and errors raised while chatting.
            st.error(f"Error loading CSV file: {e}")

with tab4:
    # Tab 4 is rendered entirely by its own function.
    analyze_research_paper()

add_footer()
909
+
910
# Sidebar for data processing and visualization: pick a tool, then apply it
# to the dataset stored in the session under 'uploaded_df'.
add_sidebar()
feature_options = st.sidebar.radio(
    "Select Option",
    [
        "Dataset Overview",
        "Clean Data",
        "Detect Outlier",
        "Encoder",
        "Data Transformer",
        "Data Analysis",
        "Feature Importance Analyzer",
        "Best Parameter Selector",
        "Select ML Models",
        "Clear Modified Dataset",
        "Visualizations",
    ],
)

if 'uploaded_df' not in st.session_state:
    st.sidebar.info("Upload a CSV to proceed.")
else:
    df = st.session_state['uploaded_df']
    try:
        # Tools whose return value replaces the stored dataset.
        replacing_tools = {
            "Clean Data": clean_data,
            "Encoder": encoder,
            "Data Transformer": data_transformer,
        }
        # Tools that are called on the dataset and whose result is ignored.
        display_tools = {
            "Dataset Overview": dataset_overview,
            "Detect Outlier": detect_outlier,
            "Data Analysis": data_analysis,
            "Feature Importance Analyzer": feature_importance_analyzer,
            "Best Parameter Selector": best_parameter_selector,
            "Select ML Models": select_ml_models,
        }

        if feature_options in replacing_tools:
            st.session_state['uploaded_df'] = replacing_tools[feature_options](df)
        elif feature_options in display_tools:
            display_tools[feature_options](df)
        elif feature_options == "Clear Modified Dataset":
            clear_modified_dataset()
        elif feature_options == "Visualizations":
            visualize_dataset(df)
            features = st.sidebar.multiselect(
                "Select features for specific visualizations", df.columns.tolist()
            )
            if features:
                visualize_specific_features(df, features)

        # Re-read the dataset in case the selected tool replaced it.
        if 'uploaded_df' in st.session_state:
            df = st.session_state['uploaded_df']
    except Exception as e:
        st.error(f"Error processing dataset: {e}")
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ faker
4
+ groq
5
+ matplotlib
6
+ seaborn
7
+ plotly
8
+ scikit-learn
9
+ PyPDF2
10
+ langchain
11
+ langchain-community
12
+ faiss-cpu
13
+ sentence-transformers
14
+ torch
15
+ numpy