vashu2425 committed on
Commit
4589c40
·
verified ·
1 Parent(s): 34f3676

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2006 -77
app.py CHANGED
@@ -16,6 +16,9 @@ import time
16
  import logging
17
  import plotly.express as px
18
  import numpy as np
 
 
 
19
 
20
  # Import local modules
21
  from eda_analysis import DatasetAnalyzer
@@ -79,6 +82,13 @@ def initialize_session_state():
79
  if "chat_history" not in st.session_state:
80
  st.session_state.chat_history = []
81
 
 
 
 
 
 
 
 
82
  # For dataframe and related variables, ensure proper initialization
83
  # df should not be in session_state until a proper DataFrame is loaded
84
  if "descriptive_stats" not in st.session_state:
@@ -136,34 +146,17 @@ def apply_custom_css():
136
  font-weight: 600;
137
  text-align: center;
138
  }
139
-
140
-
141
- /*
142
- div[data-testid="stBottomBlockContainer"] {
143
  background-color: #111827 !important;
144
- }
145
-
146
- div[data-testid="stChatInput"]{
147
- background-color: #111827 !important;
148
- } */
149
-
150
- /* Override the bottom chat input container */
151
- div.stChatFloatingInputContainer {
152
  background-color: #111827 !important;
153
- }
154
 
155
- /* Override the inner chat input box */
156
- div.stChatInputContainer {
157
- background-color: #111827 !important;
158
 
159
- }
160
 
161
- /* Optional: Override text area background */
162
- textarea {
163
- background-color: #111827 !important;
164
- color: white !important;
165
- }
166
-
167
  .sidebar-section {
168
  background: rgba(31, 41, 55, 0.4);
169
  border-radius: 8px;
@@ -907,7 +900,7 @@ def generate_ai_insights():
907
 
908
  # Check for date columns
909
  date_cols = []
910
- for col in df.columns.tolist():
911
  if df[col].dtype == 'object':
912
  try:
913
  pd.to_datetime(df[col])
@@ -982,6 +975,20 @@ def display_chat_interface():
982
  st.markdown('</div>', unsafe_allow_html=True)
983
  return
984
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
985
  # Display chat history
986
  for message in st.session_state.chat_history:
987
  if message["role"] == "user":
@@ -1161,48 +1168,25 @@ def display_distribution_tab():
1161
  key="chart_type_select"
1162
  )
1163
 
1164
- # with col2:
1165
- # if chart_type != "Distribution Plot":
1166
- # column_type = "Numerical" if chart_type in ["Histogram", "Box Plot", "Violin Plot"] else "Categorical"
1167
- # columns_to_show = df.select_dtypes(include=['number']).columns.tolist() if column_type == "Numerical" else df.select_dtypes(include=['object', 'category']).columns.tolist()
1168
-
1169
- # selected_columns = st.multiselect(
1170
- # f"Select {column_type} Columns to Visualize",
1171
- # options=columns_to_show,
1172
- # default=columns_to_show[:min(3, len(columns_to_show))],
1173
- # key="column_select"
1174
- # )
1175
- # else:
1176
- # num_cols = df.select_dtypes(include=['number']).columns.tolist()
1177
- # selected_columns = st.multiselect(
1178
- # "Select Numerical Columns",
1179
- # options=num_cols,
1180
- # default=num_cols[:min(3, len(num_cols))],
1181
- # key="column_select"
1182
- # )
1183
-
1184
-
1185
-
1186
  with col2:
1187
  if chart_type != "Distribution Plot":
1188
  column_type = "Numerical" if chart_type in ["Histogram", "Box Plot", "Violin Plot"] else "Categorical"
1189
- columns_to_show = list(df.select_dtypes(include=['number']).columns) if column_type == "Numerical" else list(df.select_dtypes(include=['object', 'category']).columns)
1190
 
1191
  selected_columns = st.multiselect(
1192
  f"Select {column_type} Columns to Visualize",
1193
  options=columns_to_show,
1194
- default=list(columns_to_show[:min(3, len(columns_to_show))]), # Convert to list ✅
1195
  key="column_select"
1196
  )
1197
  else:
1198
- num_cols = list(df.select_dtypes(include=['number']).columns) # Convert to list ✅
1199
  selected_columns = st.multiselect(
1200
  "Select Numerical Columns",
1201
  options=num_cols,
1202
- default=list(num_cols[:min(3, len(num_cols))]), # Convert to list ✅
1203
  key="column_select"
1204
  )
1205
-
1206
  st.markdown('</div>', unsafe_allow_html=True)
1207
 
1208
  # Display selected charts
@@ -1494,15 +1478,15 @@ def display_welcome_page():
1494
  5. **Transform** your features based on recommendations
1495
  """)
1496
 
1497
- # # Powered by section
1498
- # st.subheader("Powered by")
1499
- # cols = st.columns(3)
1500
- # with cols[0]:
1501
- # st.markdown("**llama3-8b-8192**")
1502
- # with cols[1]:
1503
- # st.markdown("**Groq API**")
1504
- # with cols[2]:
1505
- # st.markdown("**Streamlit**")
1506
 
1507
  # Upload prompt
1508
  st.info("👈 Please upload a CSV file using the sidebar to get started")
@@ -1580,22 +1564,11 @@ def display_relationships_tab():
1580
  # Scatter plot matrix
1581
  st.subheader("Scatter Plot Matrix")
1582
 
1583
- # # Let user choose columns
1584
- # selected_cols = st.multiselect(
1585
- # "Select columns for scatter plot matrix (max 5 recommended)",
1586
- # options=num_cols,
1587
- # default=num_cols[:min(4, len(num_cols))]
1588
- # )
1589
-
1590
-
1591
- # Convert num_cols to a list before using it in multiselect
1592
- num_cols = list(df.select_dtypes(include=['number']).columns)
1593
-
1594
- # Ensure default selection is also a list
1595
  selected_cols = st.multiselect(
1596
  "Select columns for scatter plot matrix (max 5 recommended)",
1597
  options=num_cols,
1598
- default=list(num_cols[:min(4, len(num_cols))]) # Convert to list ✅
1599
  )
1600
 
1601
  if selected_cols:
@@ -1691,9 +1664,27 @@ def process_chat_message(user_message):
1691
  "sample_data": df.head(5).to_string()
1692
  }
1693
 
1694
- # Generate response using LLM
1695
- logger.info(f"Sending question to LLM: {user_message}")
1696
- response = llm_inference.answer_dataset_question(user_message, dataset_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1697
 
1698
  # Log the raw response for debugging
1699
  logger.info(f"Raw LLM response: {response[:100]}...")
@@ -1853,7 +1844,7 @@ def main():
1853
 
1854
  st.markdown('</div>', unsafe_allow_html=True)
1855
 
1856
- # st.markdown('<div class="sidebar-footer">Powered by Hugging Face & Streamlit</div>', unsafe_allow_html=True)
1857
 
1858
  # If data is uploaded, process it
1859
  if uploaded_file is not None and ('df' not in st.session_state or st.session_state.get('df') is None):
@@ -1924,3 +1915,1941 @@ def main():
1924
 
1925
  if __name__ == "__main__":
1926
  main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  import logging
17
  import plotly.express as px
18
  import numpy as np
19
+ # Import LangChain memory components
20
+ from langchain.memory import ConversationBufferMemory
21
+ from langchain_core.messages import AIMessage, HumanMessage
22
 
23
  # Import local modules
24
  from eda_analysis import DatasetAnalyzer
 
82
  if "chat_history" not in st.session_state:
83
  st.session_state.chat_history = []
84
 
85
+ # Initialize conversation memory for LangChain
86
+ if "conversation_memory" not in st.session_state:
87
+ st.session_state.conversation_memory = ConversationBufferMemory(
88
+ memory_key="chat_history",
89
+ return_messages=True
90
+ )
91
+
92
  # For dataframe and related variables, ensure proper initialization
93
  # df should not be in session_state until a proper DataFrame is loaded
94
  if "descriptive_stats" not in st.session_state:
 
146
  font-weight: 600;
147
  text-align: center;
148
  }
149
+ /* Try to force change
150
+ div[class^="st-emotion-cache"] {
 
 
151
  background-color: #111827 !important;
152
+ }*/
153
+
154
+ div[data-testid="stBottomBlockContainer"] {
 
 
 
 
 
155
  background-color: #111827 !important;
156
+ }
157
 
 
 
 
158
 
 
159
 
 
 
 
 
 
 
160
  .sidebar-section {
161
  background: rgba(31, 41, 55, 0.4);
162
  border-radius: 8px;
 
900
 
901
  # Check for date columns
902
  date_cols = []
903
+ for col in df.columns:
904
  if df[col].dtype == 'object':
905
  try:
906
  pd.to_datetime(df[col])
 
975
  st.markdown('</div>', unsafe_allow_html=True)
976
  return
977
 
978
+ # Add a button to clear chat history
979
+ col1, col2 = st.columns([4, 1])
980
+ with col2:
981
+ if st.button("Clear Chat", key="clear_chat"):
982
+ st.session_state.chat_history = []
983
+ # Reset conversation memory
984
+ if "conversation_memory" in st.session_state:
985
+ st.session_state.conversation_memory = ConversationBufferMemory(
986
+ memory_key="chat_history",
987
+ return_messages=True
988
+ )
989
+ logger.info("Chat history and memory cleared")
990
+ st.rerun()
991
+
992
  # Display chat history
993
  for message in st.session_state.chat_history:
994
  if message["role"] == "user":
 
1168
  key="chart_type_select"
1169
  )
1170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1171
  with col2:
1172
  if chart_type != "Distribution Plot":
1173
  column_type = "Numerical" if chart_type in ["Histogram", "Box Plot", "Violin Plot"] else "Categorical"
1174
+ columns_to_show = df.select_dtypes(include=['number']).columns.tolist() if column_type == "Numerical" else df.select_dtypes(include=['object', 'category']).columns.tolist()
1175
 
1176
  selected_columns = st.multiselect(
1177
  f"Select {column_type} Columns to Visualize",
1178
  options=columns_to_show,
1179
+ default=columns_to_show[:min(3, len(columns_to_show))],
1180
  key="column_select"
1181
  )
1182
  else:
1183
+ num_cols = df.select_dtypes(include=['number']).columns.tolist()
1184
  selected_columns = st.multiselect(
1185
  "Select Numerical Columns",
1186
  options=num_cols,
1187
+ default=num_cols[:min(3, len(num_cols))],
1188
  key="column_select"
1189
  )
 
1190
  st.markdown('</div>', unsafe_allow_html=True)
1191
 
1192
  # Display selected charts
 
1478
  5. **Transform** your features based on recommendations
1479
  """)
1480
 
1481
+ # Powered by section
1482
+ st.subheader("Powered by")
1483
+ cols = st.columns(3)
1484
+ with cols[0]:
1485
+ st.markdown("**llama3-8b-8192**")
1486
+ with cols[1]:
1487
+ st.markdown("**Groq API**")
1488
+ with cols[2]:
1489
+ st.markdown("**Streamlit**")
1490
 
1491
  # Upload prompt
1492
  st.info("👈 Please upload a CSV file using the sidebar to get started")
 
1564
  # Scatter plot matrix
1565
  st.subheader("Scatter Plot Matrix")
1566
 
1567
+ # Let user choose columns
 
 
 
 
 
 
 
 
 
 
 
1568
  selected_cols = st.multiselect(
1569
  "Select columns for scatter plot matrix (max 5 recommended)",
1570
  options=num_cols,
1571
+ default=num_cols[:min(4, len(num_cols))]
1572
  )
1573
 
1574
  if selected_cols:
 
1664
  "sample_data": df.head(5).to_string()
1665
  }
1666
 
1667
+ # Generate response using LLM with memory
1668
+ logger.info(f"Sending question to LLM with memory: {user_message}")
1669
+
1670
+ # Convert chat history to LangChain format for the memory object if needed
1671
+ if len(st.session_state.chat_history) > 1 and "conversation_memory" in st.session_state:
1672
+ # Use the memory-enabled version to maintain conversation context
1673
+ response = llm_inference.answer_with_memory(
1674
+ user_message,
1675
+ dataset_info,
1676
+ st.session_state.conversation_memory
1677
+ )
1678
+ else:
1679
+ # If it's the first message, just use the regular question answering
1680
+ response = llm_inference.answer_dataset_question(user_message, dataset_info)
1681
+
1682
+ # Initialize the memory with this first exchange
1683
+ if "conversation_memory" in st.session_state:
1684
+ st.session_state.conversation_memory.save_context(
1685
+ {"input": user_message},
1686
+ {"output": response}
1687
+ )
1688
 
1689
  # Log the raw response for debugging
1690
  logger.info(f"Raw LLM response: {response[:100]}...")
 
1844
 
1845
  st.markdown('</div>', unsafe_allow_html=True)
1846
 
1847
+ st.markdown('<div class="sidebar-footer">Powered by Hugging Face & Streamlit</div>', unsafe_allow_html=True)
1848
 
1849
  # If data is uploaded, process it
1850
  if uploaded_file is not None and ('df' not in st.session_state or st.session_state.get('df') is None):
 
1915
 
1916
  if __name__ == "__main__":
1917
  main()
1918
+
1919
+
1920
+
1921
+
1922
+
1923
+
1924
+
1925
+
1926
+
1927
+
1928
+
1929
+
1930
+ # """
1931
+ # AI-Powered EDA & Feature Engineering Assistant
1932
+
1933
+ # This application enables users to upload a CSV dataset, and utilizes LLMs to analyze
1934
+ # the dataset to provide EDA and feature engineering recommendations.
1935
+ # """
1936
+
1937
+ # import streamlit as st
1938
+ # import pandas as pd
1939
+ # import os
1940
+ # import base64
1941
+ # from io import BytesIO
1942
+ # from dotenv import load_dotenv
1943
+ # from typing import Dict, List, Any, Optional
1944
+ # import time
1945
+ # import logging
1946
+ # import plotly.express as px
1947
+ # import numpy as np
1948
+
1949
+ # # Import local modules
1950
+ # from eda_analysis import DatasetAnalyzer
1951
+ # from llm_inference import LLMInference
1952
+
1953
+ # # Configure logging
1954
+ # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
1955
+ # logger = logging.getLogger(__name__)
1956
+
1957
+ # # Load environment variables
1958
+ # load_dotenv()
1959
+
1960
+ # # Set page configuration - must be the first Streamlit command
1961
+ # st.set_page_config(
1962
+ # page_title="AI-Powered EDA & Feature Engineering Assistant",
1963
+ # page_icon="📊",
1964
+ # layout="wide",
1965
+ # initial_sidebar_state="expanded"
1966
+ # )
1967
+
1968
+ # # Initialize our classes
1969
+ # @st.cache_resource
1970
+ # def get_llm_inference():
1971
+ # try:
1972
+ # return LLMInference()
1973
+ # except Exception as e:
1974
+ # st.error(f"Error initializing LLM inference: {str(e)}")
1975
+ # return None
1976
+
1977
+ # llm_inference = get_llm_inference()
1978
+
1979
+ # # Session state initialization
1980
+ # if "dataset_analyzer" not in st.session_state:
1981
+ # st.session_state.dataset_analyzer = DatasetAnalyzer()
1982
+
1983
+ # if "dataset_loaded" not in st.session_state:
1984
+ # st.session_state.dataset_loaded = False
1985
+
1986
+ # if "dataset_info" not in st.session_state:
1987
+ # st.session_state.dataset_info = {}
1988
+
1989
+ # if "visualizations" not in st.session_state:
1990
+ # st.session_state.visualizations = {}
1991
+
1992
+ # if "eda_insights" not in st.session_state:
1993
+ # st.session_state.eda_insights = ""
1994
+
1995
+ # if "feature_engineering_recommendations" not in st.session_state:
1996
+ # st.session_state.feature_engineering_recommendations = ""
1997
+
1998
+ # if "data_quality_insights" not in st.session_state:
1999
+ # st.session_state.data_quality_insights = ""
2000
+
2001
+ # if "active_tab" not in st.session_state:
2002
+ # st.session_state.active_tab = "welcome"
2003
+
2004
+ # # Add new functions to support the updated UI
2005
+ # def initialize_session_state():
2006
+ # """Initialize session state variables needed for the application"""
2007
+ # # Initialize session variables with appropriate defaults
2008
+ # if "chat_history" not in st.session_state:
2009
+ # st.session_state.chat_history = []
2010
+
2011
+ # # For dataframe and related variables, ensure proper initialization
2012
+ # # df should not be in session_state until a proper DataFrame is loaded
2013
+ # if "descriptive_stats" not in st.session_state:
2014
+ # st.session_state.descriptive_stats = None
2015
+
2016
+ # if "selected_columns" not in st.session_state:
2017
+ # st.session_state.selected_columns = []
2018
+
2019
+ # if "filtered_df" not in st.session_state:
2020
+ # st.session_state.filtered_df = None
2021
+
2022
+ # if "ai_insights" not in st.session_state:
2023
+ # st.session_state.ai_insights = None
2024
+
2025
+ # if "loading_insights" not in st.session_state:
2026
+ # st.session_state.loading_insights = False
2027
+
2028
+ # if "selected_tab" not in st.session_state:
2029
+ # st.session_state.selected_tab = 'tab-overview'
2030
+
2031
+ # if "dataset_name" not in st.session_state:
2032
+ # st.session_state.dataset_name = ""
2033
+
2034
+ # # Logging initialization
2035
+ # logger.info("Session state initialized")
2036
+
2037
+ # def apply_custom_css():
2038
+ # """Apply additional custom CSS that's not already in the main CSS block"""
2039
+ # st.markdown("""
2040
+ # <style>
2041
+ # /* Base theme variables */
2042
+ # :root {
2043
+ # --primary: #4F46E5;
2044
+ # --secondary: #06B6D4;
2045
+ # --text-light: #F3F4F6;
2046
+ # --text-muted: #9CA3AF;
2047
+ # --bg-card: rgba(31, 41, 55, 0.7);
2048
+ # --bg-dark: #111827;
2049
+ # }
2050
+
2051
+ # /* Global styles */
2052
+ # .stApp {
2053
+ # background-color: var(--bg-dark);
2054
+ # color: var(--text-light);
2055
+ # }
2056
+
2057
+ # /* Improve sidebar styling */
2058
+ # .sidebar-header {
2059
+ # background: linear-gradient(90deg, var(--primary), var(--secondary));
2060
+ # color: white;
2061
+ # padding: 1rem;
2062
+ # border-radius: 8px;
2063
+ # margin-bottom: 1.5rem;
2064
+ # font-size: 1.2rem;
2065
+ # font-weight: 600;
2066
+ # text-align: center;
2067
+ # }
2068
+
2069
+
2070
+ # /*
2071
+ # div[data-testid="stBottomBlockContainer"] {
2072
+ # background-color: #111827 !important;
2073
+ # }
2074
+
2075
+ # div[data-testid="stChatInput"]{
2076
+ # background-color: #111827 !important;
2077
+ # } */
2078
+
2079
+ # /* Override the bottom chat input container */
2080
+ # div.stChatFloatingInputContainer {
2081
+ # background-color: #111827 !important;
2082
+ # }
2083
+
2084
+ # /* Override the inner chat input box */
2085
+ # div.stChatInputContainer {
2086
+ # background-color: #111827 !important;
2087
+
2088
+ # }
2089
+
2090
+ # /* Optional: Override text area background */
2091
+ # textarea {
2092
+ # background-color: #111827 !important;
2093
+ # color: white !important;
2094
+ # }
2095
+
2096
+ # .sidebar-section {
2097
+ # background: rgba(31, 41, 55, 0.4);
2098
+ # border-radius: 8px;
2099
+ # padding: 1rem;
2100
+ # margin-bottom: 1.5rem;
2101
+ # border: 1px solid rgba(99, 102, 241, 0.1);
2102
+ # }
2103
+
2104
+ # .sidebar-footer {
2105
+ # text-align: center;
2106
+ # padding: 1rem;
2107
+ # font-size: 0.8rem;
2108
+ # color: var(--text-muted);
2109
+ # margin-top: 3rem;
2110
+ # }
2111
+
2112
+ # /* Feature Engineering Cards */
2113
+ # .fe-cards-container {
2114
+ # display: grid;
2115
+ # grid-template-columns: repeat(2, 1fr);
2116
+ # gap: 0.8rem;
2117
+ # margin-top: 1rem;
2118
+ # }
2119
+
2120
+ # .fe-card {
2121
+ # background: rgba(31, 41, 55, 0.6);
2122
+ # border-radius: 8px;
2123
+ # padding: 0.8rem;
2124
+ # text-align: center;
2125
+ # cursor: pointer;
2126
+ # transition: all 0.2s ease;
2127
+ # border: 1px solid rgba(99, 102, 241, 0.1);
2128
+ # position: relative;
2129
+ # overflow: hidden;
2130
+ # }
2131
+
2132
+ # .fe-card::before {
2133
+ # content: '';
2134
+ # position: absolute;
2135
+ # top: 0;
2136
+ # left: 0;
2137
+ # right: 0;
2138
+ # bottom: 0;
2139
+ # background: linear-gradient(135deg, var(--primary), var(--secondary));
2140
+ # opacity: 0;
2141
+ # transition: opacity 0.3s ease;
2142
+ # z-index: 0;
2143
+ # }
2144
+
2145
+ # .fe-card:hover::before {
2146
+ # opacity: 0.1;
2147
+ # }
2148
+
2149
+ # .fe-card:hover {
2150
+ # transform: translateY(-2px);
2151
+ # box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
2152
+ # border-color: rgba(99, 102, 241, 0.3);
2153
+ # }
2154
+
2155
+ # .fe-card-active {
2156
+ # border-color: var(--primary);
2157
+ # background: rgba(79, 70, 229, 0.1);
2158
+ # }
2159
+
2160
+ # .fe-card-icon {
2161
+ # font-size: 1.8rem;
2162
+ # margin-bottom: 0.3rem;
2163
+ # position: relative;
2164
+ # z-index: 1;
2165
+ # }
2166
+
2167
+ # .fe-card-title {
2168
+ # font-size: 0.85rem;
2169
+ # font-weight: 600;
2170
+ # color: var(--text-light);
2171
+ # position: relative;
2172
+ # z-index: 1;
2173
+ # }
2174
+
2175
+ # /* Tab content styling */
2176
+ # .tab-title {
2177
+ # font-size: 1.8rem;
2178
+ # margin-bottom: 1.5rem;
2179
+ # position: relative;
2180
+ # display: inline-block;
2181
+ # color: var(--text-light);
2182
+ # }
2183
+
2184
+ # .tab-title:after {
2185
+ # content: '';
2186
+ # position: absolute;
2187
+ # bottom: -10px;
2188
+ # left: 0;
2189
+ # width: 100%;
2190
+ # height: 3px;
2191
+ # background: linear-gradient(90deg, var(--primary) 0%, var(--secondary) 100%);
2192
+ # border-radius: 3px;
2193
+ # }
2194
+
2195
+ # /* Navigation Tabs */
2196
+ # .custom-tabs {
2197
+ # display: flex;
2198
+ # background: rgba(31, 41, 55, 0.6);
2199
+ # border-radius: 12px;
2200
+ # padding: 0.5rem;
2201
+ # margin-bottom: 2rem;
2202
+ # justify-content: space-between;
2203
+ # overflow: hidden;
2204
+ # border: 1px solid rgba(99, 102, 241, 0.1);
2205
+ # }
2206
+
2207
+ # .tab-item {
2208
+ # flex: 1;
2209
+ # text-align: center;
2210
+ # padding: 0.8rem 0.5rem;
2211
+ # border-radius: 8px;
2212
+ # cursor: pointer;
2213
+ # transition: all 0.3s ease;
2214
+ # position: relative;
2215
+ # z-index: 1;
2216
+ # margin: 0 0.2rem;
2217
+ # }
2218
+
2219
+ # .tab-item.active {
2220
+ # background: rgba(79, 70, 229, 0.1);
2221
+ # }
2222
+
2223
+ # .tab-item.active::before {
2224
+ # content: '';
2225
+ # position: absolute;
2226
+ # bottom: 0;
2227
+ # left: 10%;
2228
+ # right: 10%;
2229
+ # height: 3px;
2230
+ # background: linear-gradient(90deg, var(--primary), var(--secondary));
2231
+ # border-radius: 3px;
2232
+ # }
2233
+
2234
+ # .tab-item:hover {
2235
+ # background: rgba(79, 70, 229, 0.05);
2236
+ # }
2237
+
2238
+ # .tab-icon {
2239
+ # font-size: 1.5rem;
2240
+ # margin-bottom: 0.3rem;
2241
+ # }
2242
+
2243
+ # .tab-label {
2244
+ # font-size: 0.85rem;
2245
+ # font-weight: 500;
2246
+ # color: var(--text-light);
2247
+ # }
2248
+
2249
+ # .tab-content-spacer {
2250
+ # height: 1rem;
2251
+ # }
2252
+
2253
+ # /* Card styling */
2254
+ # .stats-card, .info-card, .chart-card {
2255
+ # background: rgba(31, 41, 55, 0.3);
2256
+ # border-radius: 10px;
2257
+ # padding: 1.2rem;
2258
+ # margin-bottom: 1.5rem;
2259
+ # border: 1px solid rgba(99, 102, 241, 0.1);
2260
+ # transition: all 0.3s ease;
2261
+ # }
2262
+
2263
+ # .stats-card:hover, .info-card:hover, .chart-card:hover {
2264
+ # transform: translateY(-5px);
2265
+ # box-shadow: 0 8px 15px rgba(0, 0, 0, 0.2);
2266
+ # border-color: rgba(99, 102, 241, 0.3);
2267
+ # }
2268
+
2269
+ # /* Dataset stats styling */
2270
+ # .dataset-stats {
2271
+ # display: flex;
2272
+ # flex-wrap: wrap;
2273
+ # gap: 0.8rem;
2274
+ # justify-content: center;
2275
+ # }
2276
+
2277
+ # .stat-item {
2278
+ # text-align: center;
2279
+ # padding: 0.8rem;
2280
+ # background: rgba(31, 41, 55, 0.6);
2281
+ # border-radius: 8px;
2282
+ # min-width: 80px;
2283
+ # border: 1px solid rgba(99, 102, 241, 0.2);
2284
+ # }
2285
+
2286
+ # .stat-value {
2287
+ # font-size: 1.5rem;
2288
+ # font-weight: 700;
2289
+ # color: var(--primary);
2290
+ # }
2291
+
2292
+ # .stat-label {
2293
+ # font-size: 0.8rem;
2294
+ # color: var(--text-muted);
2295
+ # margin-top: 0.3rem;
2296
+ # }
2297
+
2298
+ # /* Chart styling */
2299
+ # .chart-container {
2300
+ # margin-top: 1.5rem;
2301
+ # }
2302
+
2303
+ # .chart-card h3 {
2304
+ # font-size: 1.2rem;
2305
+ # margin-bottom: 1rem;
2306
+ # color: var(--text-light);
2307
+ # }
2308
+
2309
+ # .stat-summary {
2310
+ # display: grid;
2311
+ # grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
2312
+ # gap: 0.5rem;
2313
+ # margin-top: 1rem;
2314
+ # }
2315
+
2316
+ # .stat-pair {
2317
+ # display: flex;
2318
+ # justify-content: space-between;
2319
+ # padding: 0.3rem 0.5rem;
2320
+ # background: rgba(31, 41, 55, 0.4);
2321
+ # border-radius: 4px;
2322
+ # font-size: 0.9rem;
2323
+ # }
2324
+
2325
+ # .stat-pair span {
2326
+ # color: var(--text-muted);
2327
+ # }
2328
+
2329
+ # .stat-pair strong {
2330
+ # color: var(--text-light);
2331
+ # }
2332
+
2333
+ # /* Filter container */
2334
+ # .filter-container {
2335
+ # background: rgba(31, 41, 55, 0.3);
2336
+ # border-radius: 10px;
2337
+ # padding: 1.2rem;
2338
+ # margin-bottom: 1.5rem;
2339
+ # border: 1px solid rgba(99, 102, 241, 0.1);
2340
+ # }
2341
+
2342
+ # /* AI Insights styling */
2343
+ # .insights-container {
2344
+ # margin-top: 1rem;
2345
+ # }
2346
+
2347
+ # .insights-category {
2348
+ # margin-top: 0.5rem;
2349
+ # }
2350
+
2351
+ # .insight-card {
2352
+ # background: rgba(31, 41, 55, 0.3);
2353
+ # border-radius: 10px;
2354
+ # padding: 1.2rem;
2355
+ # margin-bottom: 1rem;
2356
+ # border: 1px solid rgba(99, 102, 241, 0.1);
2357
+ # display: flex;
2358
+ # align-items: flex-start;
2359
+ # }
2360
+
2361
+ # .insight-content {
2362
+ # display: flex;
2363
+ # align-items: flex-start;
2364
+ # gap: 1rem;
2365
+ # }
2366
+
2367
+ # .insight-icon {
2368
+ # font-size: 1.5rem;
2369
+ # margin-top: 0.1rem;
2370
+ # }
2371
+
2372
+ # .insight-text {
2373
+ # flex: 1;
2374
+ # line-height: 1.5;
2375
+ # }
2376
+
2377
+ # .generate-insights-container {
2378
+ # display: flex;
2379
+ # justify-content: center;
2380
+ # align-items: center;
2381
+ # margin: 3rem 0;
2382
+ # }
2383
+
2384
+ # .placeholder-card {
2385
+ # background: rgba(31, 41, 55, 0.3);
2386
+ # border-radius: 15px;
2387
+ # padding: 2rem;
2388
+ # text-align: center;
2389
+ # border: 1px solid rgba(99, 102, 241, 0.1);
2390
+ # max-width: 500px;
2391
+ # margin: 0 auto;
2392
+ # }
2393
+
2394
+ # .placeholder-icon {
2395
+ # font-size: 3rem;
2396
+ # margin-bottom: 1rem;
2397
+ # animation: float 3s ease-in-out infinite;
2398
+ # }
2399
+
2400
+ # .placeholder-text {
2401
+ # color: var(--text-muted);
2402
+ # line-height: 1.6;
2403
+ # margin-bottom: 1.5rem;
2404
+ # }
2405
+
2406
+ # .loading-container {
2407
+ # display: flex;
2408
+ # justify-content: center;
2409
+ # margin: 2rem 0;
2410
+ # }
2411
+
2412
+ # .loading-pulse {
2413
+ # width: 80px;
2414
+ # height: 80px;
2415
+ # border-radius: 50%;
2416
+ # background: linear-gradient(to right, var(--primary), var(--secondary));
2417
+ # animation: pulse-animation 1.5s ease infinite;
2418
+ # }
2419
+
2420
+ # @keyframes pulse-animation {
2421
+ # 0% {
2422
+ # transform: scale(0.6);
2423
+ # opacity: 0.5;
2424
+ # }
2425
+ # 50% {
2426
+ # transform: scale(1);
2427
+ # opacity: 1;
2428
+ # }
2429
+ # 100% {
2430
+ # transform: scale(0.6);
2431
+ # opacity: 0.5;
2432
+ # }
2433
+ # }
2434
+
2435
+ # @keyframes float {
2436
+ # 0% { transform: translateY(0px); }
2437
+ # 50% { transform: translateY(-10px); }
2438
+ # 100% { transform: translateY(0px); }
2439
+ # }
2440
+
2441
+ # /* Button styling */
2442
+ # button[kind="primary"] {
2443
+ # background: linear-gradient(90deg, var(--primary), var(--secondary)) !important;
2444
+ # color: white !important;
2445
+ # border: none !important;
2446
+ # border-radius: 8px !important;
2447
+ # padding: 0.6rem 1.2rem !important;
2448
+ # font-weight: 600 !important;
2449
+ # transition: all 0.3s ease !important;
2450
+ # box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1) !important;
2451
+ # }
2452
+
2453
+ # button[kind="primary"]:hover {
2454
+ # transform: translateY(-2px) !important;
2455
+ # box-shadow: 0 6px 10px rgba(0, 0, 0, 0.15) !important;
2456
+ # }
2457
+
2458
+ # button[kind="secondary"] {
2459
+ # background: rgba(79, 70, 229, 0.1) !important;
2460
+ # color: var(--text-light) !important;
2461
+ # border: 1px solid rgba(79, 70, 229, 0.3) !important;
2462
+ # border-radius: 8px !important;
2463
+ # padding: 0.6rem 1.2rem !important;
2464
+ # font-weight: 600 !important;
2465
+ # transition: all 0.3s ease !important;
2466
+ # }
2467
+
2468
+ # button[kind="secondary"]:hover {
2469
+ # background: rgba(79, 70, 229, 0.2) !important;
2470
+ # transform: translateY(-2px) !important;
2471
+ # }
2472
+
2473
+ # /* Override Streamlit default button styles */
2474
+ # .stButton>button {
2475
+ # background: linear-gradient(90deg, var(--primary), var(--secondary)) !important;
2476
+ # color: white !important;
2477
+ # border: none !important;
2478
+ # border-radius: 8px !important;
2479
+ # padding: 0.6rem 1.2rem !important;
2480
+ # font-weight: 600 !important;
2481
+ # transition: all 0.3s ease !important;
2482
+ # box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1) !important;
2483
+ # width: 100%;
2484
+ # }
2485
+
2486
+ # .stButton>button:hover {
2487
+ # transform: translateY(-2px) !important;
2488
+ # box-shadow: 0 6px 10px rgba(0, 0, 0, 0.15) !important;
2489
+ # }
2490
+
2491
+ # /* Chat interface styling */
2492
+ # .chat-interface-container {
2493
+ # padding: 1rem 0;
2494
+ # margin-bottom: 100px;
2495
+ # position: relative;
2496
+ # }
2497
+
2498
+ # .chat-messages {
2499
+ # display: flex;
2500
+ # flex-direction: column;
2501
+ # gap: 15px;
2502
+ # margin-bottom: 20px;
2503
+ # }
2504
+
2505
+ # .chat-message-user, .chat-message-ai {
2506
+ # padding: 12px 16px;
2507
+ # border-radius: 12px;
2508
+ # max-width: 80%;
2509
+ # box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
2510
+ # }
2511
+
2512
+ # .chat-message-user {
2513
+ # align-self: flex-end;
2514
+ # background: linear-gradient(135deg, var(--primary) 0%, var(--secondary) 100%);
2515
+ # color: white;
2516
+ # border-bottom-right-radius: 0;
2517
+ # margin-left: auto;
2518
+ # }
2519
+
2520
+ # .chat-message-ai {
2521
+ # align-self: flex-start;
2522
+ # background: var(--bg-card);
2523
+ # color: var(--text-light);
2524
+ # border-bottom-left-radius: 0;
2525
+ # margin-right: auto;
2526
+ # }
2527
+
2528
+ # .chat-input-container {
2529
+ # display: flex;
2530
+ # align-items: center;
2531
+ # gap: 10px;
2532
+ # margin-top: 1.5rem;
2533
+ # }
2534
+
2535
+ # .chat-suggestions {
2536
+ # display: flex;
2537
+ # flex-wrap: wrap;
2538
+ # gap: 10px;
2539
+ # margin: 1.5rem 0;
2540
+ # }
2541
+
2542
+ # .chat-suggestion {
2543
+ # background: rgba(99, 102, 241, 0.1);
2544
+ # border: 1px solid rgba(99, 102, 241, 0.3);
2545
+ # border-radius: 30px;
2546
+ # padding: 8px 15px;
2547
+ # font-size: 0.9rem;
2548
+ # color: var(--text-light);
2549
+ # cursor: pointer;
2550
+ # transition: all 0.3s ease;
2551
+ # display: inline-block;
2552
+ # margin-bottom: 8px;
2553
+ # }
2554
+
2555
+ # .chat-suggestion:hover {
2556
+ # background: rgba(99, 102, 241, 0.2);
2557
+ # transform: translateY(-2px);
2558
+ # }
2559
+
2560
+ # /* Expander styling */
2561
+ # .st-expander {
2562
+ # background: rgba(31, 41, 55, 0.2) !important;
2563
+ # border-radius: 8px !important;
2564
+ # margin-bottom: 1rem !important;
2565
+ # border: 1px solid rgba(99, 102, 241, 0.1) !important;
2566
+ # }
2567
+
2568
+ # /* Streamlit widget styling */
2569
+ # div[data-testid="stForm"] {
2570
+ # background: rgba(31, 41, 55, 0.2) !important;
2571
+ # border-radius: 10px !important;
2572
+ # padding: 1rem !important;
2573
+ # border: 1px solid rgba(99, 102, 241, 0.1) !important;
2574
+ # }
2575
+
2576
+ # .stSelectbox>div>div {
2577
+ # background: rgba(31, 41, 55, 0.4) !important;
2578
+ # border: 1px solid rgba(99, 102, 241, 0.2) !important;
2579
+ # border-radius: 8px !important;
2580
+ # }
2581
+
2582
+ # .stTextInput>div>div>input {
2583
+ # background: rgba(31, 41, 55, 0.4) !important;
2584
+ # border: 1px solid rgba(99, 102, 241, 0.2) !important;
2585
+ # border-radius: 8px !important;
2586
+ # color: var(--text-light) !important;
2587
+ # padding: 1rem !important;
2588
+ # }
2589
+
2590
+ # /* Streamlit multiselect dropdown styling */
2591
+ # div[data-baseweb="popover"] {
2592
+ # background: var(--bg-dark) !important;
2593
+ # border: 1px solid rgba(99, 102, 241, 0.2) !important;
2594
+ # border-radius: 8px !important;
2595
+ # }
2596
+
2597
+ # div[data-baseweb="menu"] {
2598
+ # background: var(--bg-dark) !important;
2599
+ # }
2600
+
2601
+ # div[role="listbox"] {
2602
+ # background: var(--bg-dark) !important;
2603
+ # }
2604
+
2605
+ # /* Fix for the upload button */
2606
+ # .stFileUploader > div {
2607
+ # display: flex;
2608
+ # flex-direction: column;
2609
+ # align-items: center;
2610
+ # }
2611
+
2612
+ # .stFileUploader > div > button {
2613
+ # background: linear-gradient(90deg, var(--primary), var(--secondary)) !important;
2614
+ # color: white !important;
2615
+ # border: none !important;
2616
+ # width: 100%;
2617
+ # margin-top: 1rem;
2618
+ # }
2619
+
2620
+ # /* Fix for tab content spacing */
2621
+ # .tab-content {
2622
+ # margin-top: 2rem;
2623
+ # padding: 1rem;
2624
+ # background: rgba(31, 41, 55, 0.2);
2625
+ # border-radius: 10px;
2626
+ # border: 1px solid rgba(99, 102, 241, 0.1);
2627
+ # }
2628
+ # </style>
2629
+ # """, unsafe_allow_html=True)
2630
+
2631
+ # def generate_ai_insights():
2632
+ # """Generate AI-powered insights about the dataset"""
2633
+ # # Make sure we have a dataframe to analyze
2634
+ # if 'df' not in st.session_state:
2635
+ # logger.warning("Cannot generate AI insights: No dataframe in session state")
2636
+ # return {}
2637
+
2638
+ # df = st.session_state.df
2639
+ # insights = {}
2640
+
2641
+ # # Try to use the LLM for insights generation first
2642
+ # try:
2643
+ # if llm_inference is not None:
2644
+ # # Create dataset_info dictionary for LLM
2645
+ # num_rows, num_cols = df.shape
2646
+ # num_numerical = len(df.select_dtypes(include=['number']).columns)
2647
+ # num_categorical = len(df.select_dtypes(include=['object', 'category']).columns)
2648
+ # num_missing = df.isnull().sum().sum()
2649
+
2650
+ # # Format missing values for better readability
2651
+ # missing_cols = df.isnull().sum()[df.isnull().sum() > 0]
2652
+ # missing_values = {}
2653
+ # for col in missing_cols.index:
2654
+ # count = missing_cols[col]
2655
+ # percent = round(count / len(df) * 100, 2)
2656
+ # missing_values[col] = (count, percent)
2657
+
2658
+ # # Get numerical columns and their correlations if applicable
2659
+ # num_cols = df.select_dtypes(include=['number']).columns
2660
+ # correlations = "No numerical columns to calculate correlations."
2661
+ # if len(num_cols) > 1:
2662
+ # # Calculate correlations
2663
+ # corr_matrix = df[num_cols].corr()
2664
+ # # Get top correlations (absolute values)
2665
+ # corr_pairs = []
2666
+ # for i in range(len(num_cols)):
2667
+ # for j in range(i):
2668
+ # val = corr_matrix.iloc[i, j]
2669
+ # if abs(val) > 0.5: # Only show strong correlations
2670
+ # corr_pairs.append((num_cols[i], num_cols[j], val))
2671
+
2672
+ # # Sort by absolute correlation and format
2673
+ # if corr_pairs:
2674
+ # corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)
2675
+ # formatted_corrs = []
2676
+ # for col1, col2, val in corr_pairs[:5]: # Top 5
2677
+ # formatted_corrs.append(f"{col1} and {col2}: {val:.3f}")
2678
+ # correlations = "\n".join(formatted_corrs)
2679
+
2680
+ # dataset_info = {
2681
+ #                 "shape": f"{num_rows} rows, {df.shape[1]} columns",
2682
+ # "columns": df.columns.tolist(),
2683
+ # "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
2684
+ # "missing_values": missing_values,
2685
+ # "basic_stats": df.describe().to_string(),
2686
+ # "correlations": correlations,
2687
+ # "sample_data": df.head(5).to_string()
2688
+ # }
2689
+
2690
+ # # Generate EDA insights with better error handling
2691
+ # logger.info("Requesting EDA insights from LLM")
2692
+ # try:
2693
+ # eda_insights = llm_inference.generate_eda_insights(dataset_info)
2694
+
2695
+ # if eda_insights and isinstance(eda_insights, str) and len(eda_insights) > 50:
2696
+ # # Clean and format the response
2697
+ # eda_insights = eda_insights.strip()
2698
+ # insights["EDA Insights"] = [eda_insights]
2699
+ # logger.info("Successfully generated EDA insights")
2700
+ # else:
2701
+ # logger.warning(f"EDA insights response was invalid: {type(eda_insights)}, length: {len(eda_insights) if isinstance(eda_insights, str) else 'N/A'}")
2702
+ # except Exception as e:
2703
+ # logger.error(f"Error generating EDA insights: {str(e)}")
2704
+
2705
+ # # Generate feature engineering recommendations
2706
+ # if "EDA Insights" in insights: # Only proceed if EDA worked
2707
+ # logger.info("Requesting feature engineering recommendations from LLM")
2708
+ # try:
2709
+ # fe_insights = llm_inference.generate_feature_engineering_recommendations(dataset_info)
2710
+
2711
+ # if fe_insights and isinstance(fe_insights, str) and len(fe_insights) > 50:
2712
+ # fe_insights = fe_insights.strip()
2713
+ # insights["Feature Engineering Recommendations"] = [fe_insights]
2714
+ # logger.info("Successfully generated feature engineering recommendations")
2715
+ # else:
2716
+ # logger.warning(f"Feature engineering response was invalid: {type(fe_insights)}, length: {len(fe_insights) if isinstance(fe_insights, str) else 'N/A'}")
2717
+ # except Exception as e:
2718
+ # logger.error(f"Error generating feature engineering recommendations: {str(e)}")
2719
+
2720
+ # # Generate data quality insights
2721
+ # logger.info("Requesting data quality insights from LLM")
2722
+ # try:
2723
+ # dq_insights = llm_inference.generate_data_quality_insights(dataset_info)
2724
+
2725
+ # if dq_insights and isinstance(dq_insights, str) and len(dq_insights) > 50:
2726
+ # dq_insights = dq_insights.strip()
2727
+ # insights["Data Quality Insights"] = [dq_insights]
2728
+ # logger.info("Successfully generated data quality insights")
2729
+ # else:
2730
+ # logger.warning(f"Data quality response was invalid: {type(dq_insights)}, length: {len(dq_insights) if isinstance(dq_insights, str) else 'N/A'}")
2731
+ # except Exception as e:
2732
+ # logger.error(f"Error generating data quality insights: {str(e)}")
2733
+
2734
+ # # If we have at least one type of insights, consider it a success
2735
+ # if insights:
2736
+ # # Mark that the insights are loaded
2737
+ # st.session_state['loading_insights'] = False
2738
+ # logger.info("Successfully generated AI insights using LLM")
2739
+ # return insights
2740
+
2741
+ # logger.warning("All LLM generated insights failed or were too short. Falling back to template insights.")
2742
+ # else:
2743
+ # logger.warning("LLM inference is not available. Falling back to template insights.")
2744
+ # except Exception as e:
2745
+ # logger.error(f"Error in generate_ai_insights(): {str(e)}. Falling back to template insights.")
2746
+
2747
+ # # If LLM fails or is not available, generate template-based insights
2748
+ # logger.info("Falling back to template-based insights generation")
2749
+
2750
+ # # Add missing values insights
2751
+ # missing_data = df.isnull().sum()
2752
+ # missing_percent = (missing_data / len(df)) * 100
2753
+ # missing_cols = missing_data[missing_data > 0]
2754
+
2755
+ # missing_insights = []
2756
+ # if len(missing_cols) > 0:
2757
+ # missing_insights.append(f"Found {len(missing_cols)} columns with missing values.")
2758
+ #         for col in missing_cols.index[:3]:  # Show details for the first 3 (columns are not sorted by missing count)
2759
+ # missing_insights.append(f"Column '{col}' has {missing_data[col]} missing values ({missing_percent[col]:.2f}%).")
2760
+
2761
+ # if len(missing_cols) > 3:
2762
+ # missing_insights.append(f"And {len(missing_cols) - 3} more columns have missing values.")
2763
+
2764
+ # # Add recommendation
2765
+ # if any(missing_percent > 50):
2766
+ # high_missing = missing_percent[missing_percent > 50].index.tolist()
2767
+ # missing_insights.append(f"Consider dropping columns with >50% missing values: {', '.join(high_missing[:3])}.")
2768
+ # else:
2769
+ # missing_insights.append("Consider using imputation techniques for columns with missing values.")
2770
+ # else:
2771
+ # missing_insights.append("No missing values found in the dataset. Great job!")
2772
+
2773
+ # insights["Missing Values Analysis"] = missing_insights
2774
+
2775
+ # # Add distribution insights
2776
+ # num_cols = df.select_dtypes(include=['number']).columns
2777
+ # dist_insights = []
2778
+
2779
+ # if len(num_cols) > 0:
2780
+ #         for col in num_cols[:3]:  # Analyze the first 3 numeric columns
2781
+ # # Check for skewness
2782
+ # skew = df[col].skew()
2783
+ # if abs(skew) > 1:
2784
+ # direction = "right" if skew > 0 else "left"
2785
+ # dist_insights.append(f"Column '{col}' is {direction}-skewed (skewness: {skew:.2f}). Consider log transformation.")
2786
+
2787
+ # # Check for outliers using IQR
2788
+ # Q1 = df[col].quantile(0.25)
2789
+ # Q3 = df[col].quantile(0.75)
2790
+ # IQR = Q3 - Q1
2791
+ # outliers = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))][col].count()
2792
+
2793
+ # if outliers > 0:
2794
+ # pct = (outliers / len(df)) * 100
2795
+ # dist_insights.append(f"Column '{col}' has {outliers} outliers ({pct:.2f}%). Consider outlier treatment.")
2796
+
2797
+ # if len(num_cols) > 3:
2798
+ # dist_insights.append(f"Additional {len(num_cols) - 3} numerical columns not analyzed here.")
2799
+ # else:
2800
+ # dist_insights.append("No numerical columns found for distribution analysis.")
2801
+
2802
+ # insights["Distribution Insights"] = dist_insights
2803
+
2804
+ # # Add correlation insights
2805
+ # corr_insights = []
2806
+ # if len(num_cols) > 1:
2807
+ # # Calculate correlation
2808
+ # corr_matrix = df[num_cols].corr()
2809
+ # high_corr = []
2810
+
2811
+ # # Find high correlations
2812
+ # for i in range(len(corr_matrix.columns)):
2813
+ # for j in range(i):
2814
+ # if abs(corr_matrix.iloc[i, j]) > 0.7:
2815
+ # high_corr.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))
2816
+
2817
+ # if high_corr:
2818
+ # corr_insights.append(f"Found {len(high_corr)} pairs of highly correlated features.")
2819
+ #             for col1, col2, corr_val in high_corr[:3]:  # Show the first 3 pairs (list is not sorted by strength)
2820
+ # corr_direction = "positively" if corr_val > 0 else "negatively"
2821
+ # corr_insights.append(f"'{col1}' and '{col2}' are strongly {corr_direction} correlated (r={corr_val:.2f}).")
2822
+
2823
+ # if len(high_corr) > 3:
2824
+ # corr_insights.append(f"And {len(high_corr) - 3} more highly correlated pairs found.")
2825
+
2826
+ # corr_insights.append("Consider removing some highly correlated features to reduce dimensionality.")
2827
+ # else:
2828
+ # corr_insights.append("No strong correlations found between features.")
2829
+ # else:
2830
+ # corr_insights.append("Need at least 2 numerical columns to analyze correlations.")
2831
+
2832
+ # insights["Correlation Analysis"] = corr_insights
2833
+
2834
+ # # Add feature engineering recommendations
2835
+ # fe_insights = []
2836
+
2837
+ # # Check for date columns
2838
+ # date_cols = []
2839
+ # for col in df.columns.tolist():
2840
+ # if df[col].dtype == 'object':
2841
+ # try:
2842
+ # pd.to_datetime(df[col])
2843
+ # date_cols.append(col)
2844
+ #             except (ValueError, TypeError, OverflowError):
2845
+ # pass
2846
+
2847
+ # if date_cols:
2848
+ # fe_insights.append(f"Found {len(date_cols)} potential date columns: {', '.join(date_cols[:3])}.")
2849
+ # fe_insights.append("Consider extracting year, month, day, weekday from these columns.")
2850
+
2851
+ # # Check for categorical columns
2852
+ # cat_cols = df.select_dtypes(include=['object']).columns
2853
+ # if len(cat_cols) > 0:
2854
+ # fe_insights.append(f"Found {len(cat_cols)} categorical columns.")
2855
+ # fe_insights.append("Consider one-hot encoding or label encoding for categorical features.")
2856
+
2857
+ # # Check for high cardinality
2858
+ # high_card_cols = []
2859
+ # for col in cat_cols:
2860
+ # if df[col].nunique() > 10:
2861
+ # high_card_cols.append((col, df[col].nunique()))
2862
+
2863
+ # if high_card_cols:
2864
+ #             fe_insights.append("Some categorical columns have high cardinality:")
2865
+ # for col, card in high_card_cols[:2]:
2866
+ # fe_insights.append(f"Column '{col}' has {card} unique values. Consider grouping less common categories.")
2867
+
2868
+ # # Suggest polynomial features if few numeric features
2869
+ # if 1 < len(num_cols) < 5:
2870
+ # fe_insights.append("Consider creating polynomial features or interaction terms between numerical features.")
2871
+
2872
+ # insights["Feature Engineering Recommendations"] = fe_insights
2873
+
2874
+ # # Add a slight delay to simulate processing
2875
+ # time.sleep(1)
2876
+
2877
+ # # Mark that the insights are loaded
2878
+ # st.session_state['loading_insights'] = False
2879
+ # logger.info("Template-based insights generation completed")
2880
+
2881
+ # return insights
2882
+
2883
+ # def display_chat_interface():
2884
+ # """Display a chat interface for interacting with the data"""
2885
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
2886
+ # st.markdown('<h2 class="tab-title">💬 Chat with Your Data</h2>', unsafe_allow_html=True)
2887
+
2888
+ # # Initialize chat history if not present
2889
+ # if "chat_history" not in st.session_state:
2890
+ # st.session_state.chat_history = []
2891
+
2892
+ # # Make sure we have data to chat about
2893
+ # if 'df' not in st.session_state or st.session_state.df is None:
2894
+ # st.error("No dataset loaded. Please upload a CSV file to chat with your data.")
2895
+
2896
+ # # Show a preview of chat capabilities
2897
+ # st.markdown("""
2898
+ # <div style="margin-top: 2rem;">
2899
+ # <h3>What can I help you with?</h3>
2900
+ # <p>Once you upload a dataset, you can ask questions like:</p>
2901
+ # <ul>
2902
+ # <li>What patterns do you see in my data?</li>
2903
+ # <li>How many missing values are there?</li>
2904
+ # <li>What feature engineering would you recommend?</li>
2905
+ # <li>Show me the distribution of a specific column</li>
2906
+ # <li>What are the correlations between features?</li>
2907
+ # </ul>
2908
+ # </div>
2909
+ # """, unsafe_allow_html=True)
2910
+
2911
+ # st.markdown('</div>', unsafe_allow_html=True)
2912
+ # return
2913
+
2914
+ # # Display chat history
2915
+ # for message in st.session_state.chat_history:
2916
+ # if message["role"] == "user":
2917
+ # st.chat_message("user").write(message["content"])
2918
+ # else:
2919
+ # st.chat_message("assistant").write(message["content"])
2920
+
2921
+ # # If no chat history, show some example questions
2922
+ # if not st.session_state.chat_history:
2923
+ # st.info("Ask me anything about your dataset! I can help you understand patterns, identify issues, and suggest improvements.")
2924
+
2925
+ # st.markdown("### Example questions you can ask:")
2926
+
2927
+ # # Create a grid of example questions using columns
2928
+ # col1, col2 = st.columns(2)
2929
+
2930
+ # with col1:
2931
+ # example_questions = [
2932
+ # "What are the key patterns in this dataset?",
2933
+ # "Which columns have missing values?",
2934
+ # "What kind of feature engineering would help?"
2935
+ # ]
2936
+
2937
+ # for i, question in enumerate(example_questions):
2938
+ # if st.button(question, key=f"example_q_{i}"):
2939
+ # process_chat_message(question)
2940
+ # st.rerun()
2941
+
2942
+ # with col2:
2943
+ # more_questions = [
2944
+ # "How are the numerical variables distributed?",
2945
+ # "What are the strongest correlations?",
2946
+ # "How can I prepare this data for modeling?"
2947
+ # ]
2948
+
2949
+ # for i, question in enumerate(more_questions):
2950
+ # if st.button(question, key=f"example_q_{i+3}"):
2951
+ # process_chat_message(question)
2952
+ # st.rerun()
2953
+
2954
+ # # Input area for new messages
2955
+ # user_input = st.chat_input("Ask a question about your data...", key="chat_input")
2956
+
2957
+ # if user_input:
2958
+ # # Add user message to chat history
2959
+ # process_chat_message(user_input)
2960
+ # st.rerun()
2961
+
2962
+ # st.markdown('</div>', unsafe_allow_html=True)
2963
+
2964
+ # def display_descriptive_tab():
2965
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
2966
+ # st.markdown('<h2 class="tab-title">📊 Descriptive Statistics</h2>', unsafe_allow_html=True)
2967
+
2968
+ # # Make sure we access the data from session state
2969
+ # if 'df' not in st.session_state or 'descriptive_stats' not in st.session_state:
2970
+ # st.error("No dataset loaded. Please upload a CSV file.")
2971
+ # st.markdown('</div>', unsafe_allow_html=True)
2972
+ # return
2973
+
2974
+ # df = st.session_state.df
2975
+ # descriptive_stats = st.session_state.descriptive_stats
2976
+
2977
+ # # Display descriptive statistics in a more visually appealing way
2978
+ # col1, col2 = st.columns([3, 1])
2979
+
2980
+ # with col1:
2981
+ # # Style the dataframe
2982
+ # st.markdown('<div class="stats-card">', unsafe_allow_html=True)
2983
+ # st.subheader("Numerical Summary")
2984
+ # st.dataframe(descriptive_stats.style.background_gradient(cmap='Blues', axis=0)
2985
+ # .format(precision=2, na_rep="Missing"), use_container_width=True)
2986
+ # st.markdown('</div>', unsafe_allow_html=True)
2987
+
2988
+ # with col2:
2989
+ # st.markdown('<div class="info-card">', unsafe_allow_html=True)
2990
+ # st.subheader("Dataset Overview")
2991
+
2992
+ # # Display dataset information in a cleaner format
2993
+ # total_rows = df.shape[0]
2994
+ # total_cols = df.shape[1]
2995
+ # numeric_cols = len(df.select_dtypes(include=['number']).columns)
2996
+ # cat_cols = len(df.select_dtypes(include=['object', 'category']).columns)
2997
+ # date_cols = len(df.select_dtypes(include=['datetime']).columns)
2998
+
2999
+ # st.markdown(f"""
3000
+ # <div class="dataset-stats">
3001
+ # <div class="stat-item">
3002
+ # <div class="stat-value">{total_rows:,}</div>
3003
+ # <div class="stat-label">Rows</div>
3004
+ # </div>
3005
+ # <div class="stat-item">
3006
+ # <div class="stat-value">{total_cols}</div>
3007
+ # <div class="stat-label">Columns</div>
3008
+ # </div>
3009
+ # <div class="stat-item">
3010
+ # <div class="stat-value">{numeric_cols}</div>
3011
+ # <div class="stat-label">Numerical</div>
3012
+ # </div>
3013
+ # <div class="stat-item">
3014
+ # <div class="stat-value">{cat_cols}</div>
3015
+ # <div class="stat-label">Categorical</div>
3016
+ # </div>
3017
+ # <div class="stat-item">
3018
+ # <div class="stat-value">{date_cols}</div>
3019
+ # <div class="stat-label">Date/Time</div>
3020
+ # </div>
3021
+ # </div>
3022
+ # """, unsafe_allow_html=True)
3023
+ # st.markdown('</div>', unsafe_allow_html=True)
3024
+
3025
+ # # Add missing values information with visualization
3026
+ # st.markdown('<div class="stats-card">', unsafe_allow_html=True)
3027
+ # st.subheader("Missing Values")
3028
+ # col1, col2 = st.columns([2, 3])
3029
+
3030
+ # with col1:
3031
+ # # Calculate missing values
3032
+ # missing_data = df.isnull().sum()
3033
+ # missing_percent = (missing_data / len(df)) * 100
3034
+ # missing_data = pd.DataFrame({
3035
+ # 'Missing Values': missing_data,
3036
+ # 'Percentage (%)': missing_percent.round(2)
3037
+ # })
3038
+ # missing_data = missing_data[missing_data['Missing Values'] > 0].sort_values('Missing Values', ascending=False)
3039
+
3040
+ # if not missing_data.empty:
3041
+ # st.dataframe(missing_data.style.background_gradient(cmap='Reds', subset=['Percentage (%)'])
3042
+ # .format({'Percentage (%)': '{:.2f}%'}), use_container_width=True)
3043
+ # else:
3044
+ # st.success("No missing values found in the dataset! 🎉")
3045
+
3046
+ # with col2:
3047
+ # if not missing_data.empty:
3048
+ # # Create a horizontal bar chart for missing values
3049
+ # fig = px.bar(missing_data,
3050
+ # x='Percentage (%)',
3051
+ # y=missing_data.index,
3052
+ # orientation='h',
3053
+ # color='Percentage (%)',
3054
+ # color_continuous_scale='Reds',
3055
+ # title='Missing Values by Column')
3056
+
3057
+ # fig.update_layout(
3058
+ # height=max(350, len(missing_data) * 30),
3059
+ # xaxis_title='Missing (%)',
3060
+ # yaxis_title='',
3061
+ # coloraxis_showscale=False,
3062
+ # margin=dict(l=0, r=10, t=30, b=0)
3063
+ # )
3064
+
3065
+ # st.plotly_chart(fig, use_container_width=True)
3066
+
3067
+ # st.markdown('</div>', unsafe_allow_html=True)
3068
+ # st.markdown('</div>', unsafe_allow_html=True)
3069
+
3070
+ # def display_distribution_tab():
3071
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
3072
+ # st.markdown('<h2 class="tab-title">📈 Data Distribution</h2>', unsafe_allow_html=True)
3073
+
3074
+ # # Make sure we access the data from session state
3075
+ # if 'df' not in st.session_state:
3076
+ # st.error("No dataset loaded. Please upload a CSV file.")
3077
+ # st.markdown('</div>', unsafe_allow_html=True)
3078
+ # return
3079
+
3080
+ # df = st.session_state.df
3081
+
3082
+ # # Add filters for better UX
3083
+ # st.markdown('<div class="filter-container">', unsafe_allow_html=True)
3084
+ # col1, col2 = st.columns([1, 1])
3085
+
3086
+ # with col1:
3087
+ # chart_type = st.selectbox(
3088
+ # "Select Chart Type",
3089
+ # ["Histogram", "Box Plot", "Violin Plot", "Distribution Plot"],
3090
+ # key="chart_type_select"
3091
+ # )
3092
+
3093
+ # # with col2:
3094
+ # # if chart_type != "Distribution Plot":
3095
+ # # column_type = "Numerical" if chart_type in ["Histogram", "Box Plot", "Violin Plot"] else "Categorical"
3096
+ # # columns_to_show = df.select_dtypes(include=['number']).columns.tolist() if column_type == "Numerical" else df.select_dtypes(include=['object', 'category']).columns.tolist()
3097
+
3098
+ # # selected_columns = st.multiselect(
3099
+ # # f"Select {column_type} Columns to Visualize",
3100
+ # # options=columns_to_show,
3101
+ # # default=columns_to_show[:min(3, len(columns_to_show))],
3102
+ # # key="column_select"
3103
+ # # )
3104
+ # # else:
3105
+ # # num_cols = df.select_dtypes(include=['number']).columns.tolist()
3106
+ # # selected_columns = st.multiselect(
3107
+ # # "Select Numerical Columns",
3108
+ # # options=num_cols,
3109
+ # # default=num_cols[:min(3, len(num_cols))],
3110
+ # # key="column_select"
3111
+ # # )
3112
+
3113
+
3114
+
3115
+ # with col2:
3116
+ # if chart_type != "Distribution Plot":
3117
+ # column_type = "Numerical" if chart_type in ["Histogram", "Box Plot", "Violin Plot"] else "Categorical"
3118
+ # columns_to_show = list(df.select_dtypes(include=['number']).columns) if column_type == "Numerical" else list(df.select_dtypes(include=['object', 'category']).columns)
3119
+
3120
+ # selected_columns = st.multiselect(
3121
+ # f"Select {column_type} Columns to Visualize",
3122
+ # options=columns_to_show,
3123
+ # default=list(columns_to_show[:min(3, len(columns_to_show))]), # Convert to list ✅
3124
+ # key="column_select"
3125
+ # )
3126
+ # else:
3127
+ # num_cols = list(df.select_dtypes(include=['number']).columns) # Convert to list ✅
3128
+ # selected_columns = st.multiselect(
3129
+ # "Select Numerical Columns",
3130
+ # options=num_cols,
3131
+ # default=list(num_cols[:min(3, len(num_cols))]), # Convert to list ✅
3132
+ # key="column_select"
3133
+ # )
3134
+
3135
+ # st.markdown('</div>', unsafe_allow_html=True)
3136
+
3137
+ # # Display selected charts
3138
+ # if selected_columns:
3139
+ # st.markdown('<div class="chart-container">', unsafe_allow_html=True)
3140
+
3141
+ # if chart_type == "Histogram":
3142
+ # col1, col2 = st.columns([3, 1])
3143
+ # with col2:
3144
+ # bins = st.slider("Number of bins", min_value=5, max_value=100, value=30, key="hist_bins")
3145
+ # kde = st.checkbox("Show KDE", value=True, key="show_kde")
3146
+
3147
+ # with col1:
3148
+ # pass
3149
+
3150
+ # # Display histograms with better styling
3151
+ # for column in selected_columns:
3152
+ # st.markdown(f'<div class="chart-card"><h3>{column}</h3>', unsafe_allow_html=True)
3153
+ # fig = px.histogram(df, x=column, nbins=bins,
3154
+ # title=f"Histogram of {column}",
3155
+ # marginal="box" if kde else None,
3156
+ # color_discrete_sequence=['rgba(99, 102, 241, 0.7)'])
3157
+
3158
+ # fig.update_layout(
3159
+ # template="plotly_white",
3160
+ # height=400,
3161
+ # margin=dict(l=10, r=10, t=40, b=10),
3162
+ # xaxis_title=column,
3163
+ # yaxis_title="Frequency",
3164
+ # bargap=0.1
3165
+ # )
3166
+
3167
+ # st.plotly_chart(fig, use_container_width=True)
3168
+
3169
+ # # Show basic statistics
3170
+ # stats = df[column].describe().to_dict()
3171
+ # st.markdown(f"""
3172
+ # <div class="stat-summary">
3173
+ # <div class="stat-pair"><span>Mean:</span> <strong>{stats['mean']:.2f}</strong></div>
3174
+ # <div class="stat-pair"><span>Median:</span> <strong>{stats['50%']:.2f}</strong></div>
3175
+ # <div class="stat-pair"><span>Std Dev:</span> <strong>{stats['std']:.2f}</strong></div>
3176
+ # <div class="stat-pair"><span>Min:</span> <strong>{stats['min']:.2f}</strong></div>
3177
+ # <div class="stat-pair"><span>Max:</span> <strong>{stats['max']:.2f}</strong></div>
3178
+ # </div>
3179
+ # """, unsafe_allow_html=True)
3180
+ # st.markdown('</div>', unsafe_allow_html=True)
3181
+
3182
+ # elif chart_type == "Box Plot":
3183
+ # for column in selected_columns:
3184
+ # st.markdown(f'<div class="chart-card"><h3>{column}</h3>', unsafe_allow_html=True)
3185
+ # fig = px.box(df, y=column, title=f"Box Plot of {column}",
3186
+ # color_discrete_sequence=['rgba(99, 102, 241, 0.7)'])
3187
+
3188
+ # fig.update_layout(
3189
+ # template="plotly_white",
3190
+ # height=400,
3191
+ # margin=dict(l=10, r=10, t=40, b=10),
3192
+ # yaxis_title=column
3193
+ # )
3194
+
3195
+ # st.plotly_chart(fig, use_container_width=True)
3196
+
3197
+ # # Show outlier information
3198
+ # q1 = df[column].quantile(0.25)
3199
+ # q3 = df[column].quantile(0.75)
3200
+ # iqr = q3 - q1
3201
+ # lower_bound = q1 - 1.5 * iqr
3202
+ # upper_bound = q3 + 1.5 * iqr
3203
+ # outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
3204
+
3205
+ # st.markdown(f"""
3206
+ # <div class="stat-summary">
3207
+ # <div class="stat-pair"><span>Q1 (25%):</span> <strong>{q1:.2f}</strong></div>
3208
+ # <div class="stat-pair"><span>Median:</span> <strong>{df[column].median():.2f}</strong></div>
3209
+ # <div class="stat-pair"><span>Q3 (75%):</span> <strong>{q3:.2f}</strong></div>
3210
+ # <div class="stat-pair"><span>IQR:</span> <strong>{iqr:.2f}</strong></div>
3211
+ # <div class="stat-pair"><span>Outliers:</span> <strong>{len(outliers)}</strong> ({(len(outliers)/len(df)*100):.2f}%)</div>
3212
+ # </div>
3213
+ # """, unsafe_allow_html=True)
3214
+ # st.markdown('</div>', unsafe_allow_html=True)
3215
+
3216
+ # elif chart_type == "Violin Plot":
3217
+ # for column in selected_columns:
3218
+ # st.markdown(f'<div class="chart-card"><h3>{column}</h3>', unsafe_allow_html=True)
3219
+ # fig = px.violin(df, y=column, box=True, points="all", title=f"Violin Plot of {column}",
3220
+ # color_discrete_sequence=['rgba(99, 102, 241, 0.7)'])
3221
+
3222
+ # fig.update_layout(
3223
+ # template="plotly_white",
3224
+ # height=400,
3225
+ # margin=dict(l=10, r=10, t=40, b=10),
3226
+ # yaxis_title=column
3227
+ # )
3228
+
3229
+ # fig.update_traces(marker=dict(size=3, opacity=0.5))
3230
+ # st.plotly_chart(fig, use_container_width=True)
3231
+ # st.markdown('</div>', unsafe_allow_html=True)
3232
+
3233
+ # elif chart_type == "Distribution Plot":
3234
+ # if len(selected_columns) >= 2:
3235
+ # st.markdown('<div class="chart-card">', unsafe_allow_html=True)
3236
+ # chart_options = st.radio(
3237
+ # "Select Distribution Plot Type",
3238
+ # ["Scatter Plot", "Correlation Heatmap"],
3239
+ # horizontal=True
3240
+ # )
3241
+
3242
+ # if chart_options == "Scatter Plot":
3243
+ # col1, col2 = st.columns([3, 1])
3244
+ # with col2:
3245
+ # x_axis = st.selectbox("X-axis", options=selected_columns, index=0)
3246
+ # y_axis = st.selectbox("Y-axis", options=selected_columns, index=min(1, len(selected_columns)-1))
3247
+ # color_option = st.selectbox("Color by", options=["None"] + df.columns.tolist())
3248
+
3249
+ # with col1:
3250
+ # if color_option != "None":
3251
+ # fig = px.scatter(df, x=x_axis, y=y_axis,
3252
+ # color=color_option,
3253
+ # title=f"{y_axis} vs {x_axis} (colored by {color_option})",
3254
+ # opacity=0.7,
3255
+ # marginal_x="histogram", marginal_y="histogram")
3256
+ # else:
3257
+ # fig = px.scatter(df, x=x_axis, y=y_axis,
3258
+ # title=f"{y_axis} vs {x_axis}",
3259
+ # opacity=0.7,
3260
+ # marginal_x="histogram", marginal_y="histogram")
3261
+
3262
+ # fig.update_layout(
3263
+ # template="plotly_white",
3264
+ # height=600,
3265
+ # margin=dict(l=10, r=10, t=40, b=10),
3266
+ # )
3267
+
3268
+ # st.plotly_chart(fig, use_container_width=True)
3269
+
3270
+ # elif chart_options == "Correlation Heatmap":
3271
+ # # Calculate correlation matrix
3272
+ # corr_matrix = df[selected_columns].corr()
3273
+
3274
+ # # Create heatmap
3275
+ # fig = px.imshow(corr_matrix,
3276
+ # text_auto=".2f",
3277
+ # color_continuous_scale="RdBu_r",
3278
+ # zmin=-1, zmax=1,
3279
+ # title="Correlation Heatmap")
3280
+
3281
+ # fig.update_layout(
3282
+ # template="plotly_white",
3283
+ # height=600,
3284
+ # margin=dict(l=10, r=10, t=40, b=10),
3285
+ # )
3286
+
3287
+ # st.plotly_chart(fig, use_container_width=True)
3288
+
3289
+ # # Show highest correlations
3290
+ # corr_df = corr_matrix.stack().reset_index()
3291
+ # corr_df.columns = ['Variable 1', 'Variable 2', 'Correlation']
3292
+ # corr_df = corr_df[corr_df['Variable 1'] != corr_df['Variable 2']]
3293
+ # corr_df = corr_df.sort_values('Correlation', ascending=False).head(5)
3294
+
3295
+ # st.markdown("##### Top 5 Highest Correlations")
3296
+ # st.dataframe(corr_df.style.background_gradient(cmap='Blues')
3297
+ # .format({'Correlation': '{:.2f}'}), use_container_width=True)
3298
+ # st.markdown('</div>', unsafe_allow_html=True)
3299
+ # else:
3300
+ # st.warning("Please select at least 2 numerical columns to see distribution plots")
3301
+
3302
+ # st.markdown('</div>', unsafe_allow_html=True)
3303
+ # else:
3304
+ # st.info("Please select at least one column to visualize")
3305
+
3306
+ # st.markdown('</div>', unsafe_allow_html=True)
3307
+
3308
+ # def display_ai_insights_tab():
3309
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
3310
+ # st.markdown('<h2 class="tab-title">🧠 AI-Generated Insights</h2>', unsafe_allow_html=True)
3311
+
3312
+ # # Make sure we access the data from session state
3313
+ # if 'df' not in st.session_state:
3314
+ # st.error("No dataset loaded. Please upload a CSV file.")
3315
+ # st.markdown('</div>', unsafe_allow_html=True)
3316
+ # return
3317
+
3318
+ # if st.session_state.get('loading_insights', False):
3319
+ # with st.spinner("Generating AI insights about your data..."):
3320
+ # st.markdown('<div class="loading-container"><div class="loading-pulse"></div></div>', unsafe_allow_html=True)
3321
+ # time.sleep(0.1) # Small delay to ensure UI updates
3322
+
3323
+ # # AI insights section
3324
+ # if 'ai_insights' in st.session_state and st.session_state.ai_insights and len(st.session_state.ai_insights) > 0:
3325
+ # insights = st.session_state.ai_insights
3326
+
3327
+ # st.markdown('<div class="insights-container">', unsafe_allow_html=True)
3328
+
3329
+ # for i, (category, insight_list) in enumerate(insights.items()):
3330
+ # with st.expander(f"{category}", expanded=i < 2):
3331
+ # st.markdown('<div class="insights-category">', unsafe_allow_html=True)
3332
+
3333
+ # # Check if the insights are from LLM (single string) or template (list of strings)
3334
+ # if len(insight_list) == 1 and isinstance(insight_list[0], str) and len(insight_list[0]) > 100:
3335
+ # # This is likely an LLM-generated insight (single long string)
3336
+ # st.markdown(insight_list[0])
3337
+ # else:
3338
+ # # Template-based insights (list of strings)
3339
+ # for insight in insight_list:
3340
+ # st.markdown(f"""
3341
+ # <div class="insight-card">
3342
+ # <div class="insight-content">
3343
+ # <div class="insight-icon">💡</div>
3344
+ # <div class="insight-text">{insight}</div>
3345
+ # </div>
3346
+ # </div>
3347
+ # """, unsafe_allow_html=True)
3348
+
3349
+ # st.markdown('</div>', unsafe_allow_html=True)
3350
+
3351
+ # st.markdown('</div>', unsafe_allow_html=True)
3352
+
3353
+ # # Add regenerate button
3354
+ # st.markdown('<div style="text-align: center; margin-top: 20px;">', unsafe_allow_html=True)
3355
+ # if st.button("Regenerate Insights", key="regenerate_insights"):
3356
+ # st.session_state['loading_insights'] = True
3357
+ # st.session_state['ai_insights'] = None
3358
+ # logger.info("User requested regeneration of AI insights")
3359
+ # st.rerun()
3360
+ # st.markdown('</div>', unsafe_allow_html=True)
3361
+ # else:
3362
+ # if not st.session_state.get('loading_insights', False):
3363
+ # # Show generate button if insights are not loading and not available
3364
+ # st.markdown('<div class="generate-insights-container">', unsafe_allow_html=True)
3365
+ # st.markdown("""
3366
+ # <div class="placeholder-card">
3367
+ # <div class="placeholder-icon">🧠</div>
3368
+ # <div class="placeholder-text">Generate AI-powered insights about your dataset to discover patterns, anomalies, and suggestions for feature engineering.</div>
3369
+ # </div>
3370
+ # """, unsafe_allow_html=True)
3371
+ # if st.button("Generate Insights", key="generate_insights"):
3372
+ # st.session_state['loading_insights'] = True
3373
+ # logger.info("User initiated AI insights generation")
3374
+ # st.rerun()
3375
+ # st.markdown('</div>', unsafe_allow_html=True)
3376
+
3377
+ # st.markdown('</div>', unsafe_allow_html=True)
3378
+
3379
+ # def display_welcome_page():
3380
+ # """Display a welcome page with information about the application"""
3381
+ # # Use Streamlit columns and components instead of raw HTML
3382
+ # st.title("Welcome to AI-Powered EDA & Feature Engineering Assistant")
3383
+
3384
+ # st.write("""
3385
+ # Upload your CSV dataset and leverage the power of AI to analyze, visualize, and improve your data.
3386
+ # This tool helps you understand your data better and prepare it for machine learning models.
3387
+ # """)
3388
+
3389
+ # # Feature cards
3390
+ # st.subheader("Key Features")
3391
+
3392
+ # # Use Streamlit columns to create a grid layout
3393
+ # col1, col2 = st.columns(2)
3394
+
3395
+ # with col1:
3396
+ # st.markdown("#### 📊 Exploratory Data Analysis")
3397
+ # st.write("Quickly understand your dataset with automatic statistical analysis and visualizations")
3398
+
3399
+ # st.markdown("#### 🧠 AI-Powered Insights")
3400
+ # st.write("Get intelligent recommendations about patterns, anomalies, and opportunities in your data")
3401
+
3402
+ # st.markdown("#### ⚡ Feature Engineering")
3403
+ # st.write("Transform and enhance your features to improve machine learning model performance")
3404
+
3405
+ # with col2:
3406
+ # st.markdown("#### 📈 Interactive Visualizations")
3407
+ # st.write("Explore distributions, relationships, and outliers with dynamic charts")
3408
+
3409
+ # st.markdown("#### 💬 Chat Interface")
3410
+ # st.write("Ask questions about your data and get AI-powered answers in natural language")
3411
+
3412
+ # st.markdown("#### 🔄 Data Transformation")
3413
+ # st.write("Clean, transform, and prepare your data for modeling with guided workflows")
3414
+
3415
+ # # Usage section
3416
+ # st.subheader("How to use")
3417
+
3418
+ # st.markdown("""
3419
+ # 1. **Upload** your CSV dataset using the sidebar on the left
3420
+ # 2. **Explore** automatically generated statistics and visualizations
3421
+ # 3. **Generate** AI insights to better understand your data
3422
+ # 4. **Chat** with AI to ask specific questions about your dataset
3423
+ # 5. **Transform** your features based on recommendations
3424
+ # """)
3425
+
3426
+ # # # Powered by section
3427
+ # # st.subheader("Powered by")
3428
+ # # cols = st.columns(3)
3429
+ # # with cols[0]:
3430
+ # # st.markdown("**llama3-8b-8192**")
3431
+ # # with cols[1]:
3432
+ # # st.markdown("**Groq API**")
3433
+ # # with cols[2]:
3434
+ # # st.markdown("**Streamlit**")
3435
+
3436
+ # # Upload prompt
3437
+ # st.info("👈 Please upload a CSV file using the sidebar to get started")
3438
+
3439
+ # def display_relationships_tab():
3440
+ # """Display correlations and relationships between variables"""
3441
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
3442
+ # st.markdown('<h2 class="tab-title">🔄 Relationships & Correlations</h2>', unsafe_allow_html=True)
3443
+
3444
+ # # Make sure we have data to visualize
3445
+ # if 'df' not in st.session_state or st.session_state.df is None:
3446
+ # st.error("No dataset loaded. Please upload a CSV file.")
3447
+ # st.markdown('</div>', unsafe_allow_html=True)
3448
+ # return
3449
+
3450
+ # df = st.session_state.df
3451
+
3452
+ # # Select numerical columns for correlation analysis
3453
+ # num_cols = df.select_dtypes(include=['number']).columns
3454
+
3455
+ # if len(num_cols) < 2:
3456
+ # st.warning("At least 2 numerical columns are needed for correlation analysis.")
3457
+ # st.markdown('</div>', unsafe_allow_html=True)
3458
+ # return
3459
+
3460
+ # # Correlation matrix heatmap
3461
+ # st.subheader("Correlation Matrix")
3462
+
3463
+ # # Calculate correlation
3464
+ # corr_matrix = df[num_cols].corr()
3465
+
3466
+ # # Create correlation heatmap
3467
+ # fig = px.imshow(
3468
+ # corr_matrix,
3469
+ # text_auto=".2f",
3470
+ # color_continuous_scale="RdBu_r",
3471
+ # zmin=-1, zmax=1,
3472
+ # aspect="auto",
3473
+ # title="Correlation Heatmap"
3474
+ # )
3475
+
3476
+ # fig.update_layout(
3477
+ # height=600,
3478
+ # width=800,
3479
+ # title_font_size=20,
3480
+ # margin=dict(l=10, r=10, t=30, b=10)
3481
+ # )
3482
+
3483
+ # st.plotly_chart(fig, use_container_width=True)
3484
+
3485
+ # # Show top correlations
3486
+ # st.subheader("Top Correlations")
3487
+
3488
+ # # Extract and format correlations
3489
+ # corr_pairs = []
3490
+ # for i in range(len(num_cols)):
3491
+ # for j in range(i):
3492
+ # corr_pairs.append({
3493
+ # 'Feature 1': num_cols[i],
3494
+ # 'Feature 2': num_cols[j],
3495
+ # 'Correlation': corr_matrix.iloc[i, j]
3496
+ # })
3497
+
3498
+ # # Convert to dataframe and sort
3499
+ # corr_df = pd.DataFrame(corr_pairs)
3500
+ # sorted_corr = corr_df.sort_values('Correlation', key=abs, ascending=False).head(10)
3501
+
3502
+ # # Show table with styled background
3503
+ # st.dataframe(
3504
+ # sorted_corr.style.background_gradient(cmap='RdBu_r', subset=['Correlation'])
3505
+ # .format({'Correlation': '{:.3f}'}),
3506
+ # use_container_width=True
3507
+ # )
3508
+
3509
+ # # Scatter plot matrix
3510
+ # st.subheader("Scatter Plot Matrix")
3511
+
3512
+ # # # Let user choose columns
3513
+ # # selected_cols = st.multiselect(
3514
+ # # "Select columns for scatter plot matrix (max 5 recommended)",
3515
+ # # options=num_cols,
3516
+ # # default=num_cols[:min(4, len(num_cols))]
3517
+ # # )
3518
+
3519
+
3520
+ # # Convert num_cols to a list before using it in multiselect
3521
+ # num_cols = list(df.select_dtypes(include=['number']).columns)
3522
+
3523
+ # # Ensure default selection is also a list
3524
+ # selected_cols = st.multiselect(
3525
+ # "Select columns for scatter plot matrix (max 5 recommended)",
3526
+ # options=num_cols,
3527
+ # default=list(num_cols[:min(4, len(num_cols))]) # Convert to list ✅
3528
+ # )
3529
+
3530
+ # if selected_cols:
3531
+ # if len(selected_cols) > 5:
3532
+ # st.warning("More than 5 columns may make the plot hard to read.")
3533
+
3534
+ # color_col = st.selectbox("Color by", options=["None"] + df.columns.tolist())
3535
+
3536
+ # # Only pass the color parameter if not "None"
3537
+ # if color_col != "None":
3538
+ # fig = px.scatter_matrix(
3539
+ # df,
3540
+ # dimensions=selected_cols,
3541
+ # color=color_col,
3542
+ # opacity=0.7,
3543
+ # title="Scatter Plot Matrix"
3544
+ # )
3545
+ # else:
3546
+ # fig = px.scatter_matrix(
3547
+ # df,
3548
+ # dimensions=selected_cols,
3549
+ # opacity=0.7,
3550
+ # title="Scatter Plot Matrix"
3551
+ # )
3552
+
3553
+ # fig.update_layout(
3554
+ # height=700,
3555
+ # title_font_size=18,
3556
+ # margin=dict(l=10, r=10, t=30, b=10)
3557
+ # )
3558
+
3559
+ # st.plotly_chart(fig, use_container_width=True)
3560
+
3561
+ # st.markdown('</div>', unsafe_allow_html=True)
3562
+
3563
+ # def process_chat_message(user_message):
3564
+ # """Process a user message in the chat interface"""
3565
+ # # Add user message to chat history
3566
+ # st.session_state.chat_history.append({"role": "user", "content": user_message})
3567
+
3568
+ # # Generate a response from the AI
3569
+ # if 'df' in st.session_state and st.session_state.df is not None:
3570
+ # # Try to use LLM if available, otherwise fall back to templates
3571
+ # try:
3572
+ # if llm_inference is not None:
3573
+ # # Create a prompt about the dataset
3574
+ # df = st.session_state.df
3575
+
3576
+ # # Get basic dataset info
3577
+ # num_rows, num_cols = df.shape
3578
+ # num_numerical = len(df.select_dtypes(include=['number']).columns)
3579
+ # num_categorical = len(df.select_dtypes(include=['object', 'category']).columns)
3580
+ # num_missing = df.isnull().sum().sum()
3581
+ # missing_cols = df.isnull().sum()[df.isnull().sum() > 0]
3582
+
3583
+ # # Format missing values for better readability
3584
+ # missing_values = {}
3585
+ # for col in missing_cols.index:
3586
+ # count = missing_cols[col]
3587
+ # percent = round(count / len(df) * 100, 2)
3588
+ # missing_values[col] = (count, percent)
3589
+
3590
+ # # Get correlations for numerical columns
3591
+ # num_cols = df.select_dtypes(include=['number']).columns
3592
+ # correlations = "No numerical columns to calculate correlations."
3593
+ # if len(num_cols) > 1:
3594
+ # # Calculate correlations
3595
+ # corr_matrix = df[num_cols].corr()
3596
+ # # Get top 5 correlations (absolute values)
3597
+ # corr_pairs = []
3598
+ # for i in range(len(num_cols)):
3599
+ # for j in range(i):
3600
+ # val = corr_matrix.iloc[i, j]
3601
+ # if abs(val) > 0.5: # Only show strong correlations
3602
+ # corr_pairs.append((num_cols[i], num_cols[j], val))
3603
+
3604
+ # # Sort by absolute correlation and format
3605
+ # if corr_pairs:
3606
+ # corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)
3607
+ # formatted_corrs = []
3608
+ # for col1, col2, val in corr_pairs[:5]: # Top 5
3609
+ # formatted_corrs.append(f"{col1} and {col2}: {val:.3f}")
3610
+ # correlations = "\n".join(formatted_corrs)
3611
+
3612
+ # # Create dataset_info dictionary for LLM
3613
+ # dataset_info = {
3614
+ # "shape": f"{num_rows} rows, {num_cols} columns",
3615
+ # "columns": df.columns.tolist(),
3616
+ # "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
3617
+ # "missing_values": missing_values,
3618
+ # "basic_stats": df.describe().to_string(),
3619
+ # "correlations": correlations,
3620
+ # "sample_data": df.head(5).to_string()
3621
+ # }
3622
+
3623
+ # # Generate response using LLM
3624
+ # logger.info(f"Sending question to LLM: {user_message}")
3625
+ # response = llm_inference.answer_dataset_question(user_message, dataset_info)
3626
+
3627
+ # # Log the raw response for debugging
3628
+ # logger.info(f"Raw LLM response: {response[:100]}...")
3629
+
3630
+ # # If response is not empty and is a valid string
3631
+ # if response and isinstance(response, str) and len(response) > 10:
3632
+ # # Clean up the response if needed
3633
+ # cleaned_response = response.strip()
3634
+
3635
+ # # Add to chat history
3636
+ # st.session_state.chat_history.append({"role": "assistant", "content": cleaned_response})
3637
+ # return
3638
+ # else:
3639
+ # logger.warning(f"LLM response too short or invalid: {response}")
3640
+ # raise Exception("LLM response too short or invalid")
3641
+ # else:
3642
+ # raise Exception("LLM not available")
3643
+
3644
+ # except Exception as e:
3645
+ # logger.warning(f"Error using LLM for chat response: {str(e)}. Falling back to templates.")
3646
+ # # Fall back happens below
3647
+
3648
+ # # If we're here, either there's no dataframe, LLM failed, or response was invalid
3649
+ # # Use template-based responses as fallback
3650
+ # if 'df' in st.session_state and st.session_state.df is not None:
3651
+ # df = st.session_state.df
3652
+
3653
+ # # Simple response templates
3654
+ # responses = {
3655
+ # "missing": f"I found {df.isnull().sum().sum()} missing values across the dataset. The columns with the most missing values are: {df.isnull().sum().sort_values(ascending=False).head(3).index.tolist()}.",
3656
+ # "pattern": "Looking at the data, I can see several interesting patterns. The numerical features show varied distributions, and there might be some correlations worth exploring further.",
3657
+ # "feature": "Based on the data, I'd recommend feature engineering steps like handling missing values, encoding categorical variables, and possibly creating interaction terms for highly correlated features.",
3658
+ # "distribution": f"The numerical variables show different distributions. Some appear to be normally distributed while others show skewness. Let me know if you want to see visualizations for specific columns.",
3659
+ # "correlation": "I detected several strong correlations in the dataset. You might want to look at the correlation heatmap in the Relationships tab for more details.",
3660
+ # "prepare": "To prepare this data for modeling, I suggest: 1) Handling missing values, 2) Encoding categorical variables, 3) Feature scaling, and 4) Possibly dimensionality reduction if you have many features."
3661
+ # }
3662
+
3663
+ # # Simple keyword matching for demo purposes
3664
+ # if "missing" in user_message.lower():
3665
+ # response = responses["missing"]
3666
+ # elif "pattern" in user_message.lower():
3667
+ # response = responses["pattern"]
3668
+ # elif "feature" in user_message.lower() or "engineering" in user_message.lower():
3669
+ # response = responses["feature"]
3670
+ # elif "distribut" in user_message.lower():
3671
+ # response = responses["distribution"]
3672
+ # elif "correlat" in user_message.lower() or "relation" in user_message.lower():
3673
+ # response = responses["correlation"]
3674
+ # elif "prepare" in user_message.lower() or "model" in user_message.lower():
3675
+ # response = responses["prepare"]
3676
+ # else:
3677
+ # # Generic response
3678
+ # response = "I analyzed your dataset and found some interesting insights. You can explore different aspects of your data using the tabs above. Is there anything specific you'd like to know about your data?"
3679
+ # else:
3680
+ # response = "Please upload a dataset first so I can analyze it and answer your questions."
3681
+
3682
+ # # Add AI response to chat history
3683
+ # st.session_state.chat_history.append({"role": "assistant", "content": response})
3684
+
3685
+ # def main():
3686
+ # """Main function to run the application"""
3687
+ # # Initialize session state at the beginning
3688
+ # initialize_session_state()
3689
+
3690
+ # # Apply CSS styling
3691
+ # apply_custom_css()
3692
+
3693
+ # # Sidebar for file upload and settings
3694
+ # with st.sidebar:
3695
+ # st.markdown('<div class="sidebar-header">AI-Powered EDA & Feature Engineering</div>', unsafe_allow_html=True)
3696
+
3697
+ # # File uploader
3698
+ # st.markdown('<div class="sidebar-section">', unsafe_allow_html=True)
3699
+ # st.markdown('### Upload Dataset')
3700
+ # uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
3701
+ # st.markdown('</div>', unsafe_allow_html=True)
3702
+
3703
+ # # Load example dataset
3704
+ # with st.expander("Or use an example dataset"):
3705
+ # example_datasets = {
3706
+ # "Iris": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
3707
+ # "Tips": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv",
3708
+ # "Titanic": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv",
3709
+ # "Diamonds": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv"
3710
+ # }
3711
+ # selected_example = st.selectbox("Select example dataset", list(example_datasets.keys()))
3712
+ # if st.button("Load Example", key="load_example_btn"):
3713
+ # try:
3714
+ # # Load the selected example dataset
3715
+ # df = pd.read_csv(example_datasets[selected_example])
3716
+
3717
+ # # Verify we have a valid dataframe
3718
+ # if df is not None and not df.empty:
3719
+ # st.session_state['df'] = df
3720
+ # st.session_state['descriptive_stats'] = df.describe()
3721
+ # st.session_state['dataset_name'] = selected_example
3722
+ # st.success(f"Loaded {selected_example} dataset!")
3723
+ # else:
3724
+ # st.error(f"The {selected_example} dataset appears to be empty.")
3725
+ # except Exception as e:
3726
+ # st.error(f"Error loading example dataset: {str(e)}")
3727
+
3728
+ # # Only show these sections if a dataset is loaded
3729
+ # if 'df' in st.session_state:
3730
+ # # Dataset Info
3731
+ # st.markdown('<div class="sidebar-section">', unsafe_allow_html=True)
3732
+ # st.markdown(f'### Dataset Info: {st.session_state.get("dataset_name", "Uploaded Data")}')
3733
+ # df = st.session_state.df
3734
+ # # Add check to ensure df is not None before accessing shape
3735
+ # if df is not None:
3736
+ # st.write(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
3737
+ # else:
3738
+ # st.error("Dataset is loaded but appears to be empty.")
3739
+ # st.markdown('</div>', unsafe_allow_html=True)
3740
+
3741
+ # # Column filters
3742
+ # st.markdown('<div class="sidebar-section">', unsafe_allow_html=True)
3743
+ # st.markdown('### Column Filters')
3744
+ # if df is not None:
3745
+ # selected_columns = st.multiselect("Select columns to analyze",
3746
+ # options=df.columns.tolist(),
3747
+ # default=df.columns.tolist())
3748
+
3749
+ # if len(selected_columns) > 0:
3750
+ # st.session_state['selected_columns'] = selected_columns
3751
+ # st.session_state['filtered_df'] = df[selected_columns]
3752
+ # else:
3753
+ # st.session_state['selected_columns'] = df.columns.tolist()
3754
+ # st.session_state['filtered_df'] = df
3755
+ # st.markdown('</div>', unsafe_allow_html=True)
3756
+
3757
+ # # Feature Engineering options with Streamlit buttons instead of JavaScript
3758
+ # st.markdown('<div class="sidebar-section">', unsafe_allow_html=True)
3759
+ # st.markdown('### Feature Engineering')
3760
+
3761
+ # col1, col2 = st.columns(2)
3762
+ # with col1:
3763
+ # if st.button("Missing Values", key="missing_values_btn"):
3764
+ # st.session_state['fe_selected'] = 'missing_values'
3765
+
3766
+ # with col2:
3767
+ # if st.button("Encode Categorical", key="encode_cat_btn"):
3768
+ # st.session_state['fe_selected'] = 'encode_categorical'
3769
+
3770
+ # col1, col2 = st.columns(2)
3771
+ # with col1:
3772
+ # if st.button("Scale Features", key="scale_features_btn"):
3773
+ # st.session_state['fe_selected'] = 'scale_features'
3774
+
3775
+ # with col2:
3776
+ # if st.button("Transform", key="transform_btn"):
3777
+ # st.session_state['fe_selected'] = 'transform'
3778
+
3779
+ # # Display currently selected feature engineering option
3780
+ # if 'fe_selected' in st.session_state:
3781
+ # st.info(f"Selected: {st.session_state['fe_selected']}")
3782
+
3783
+ # st.markdown('</div>', unsafe_allow_html=True)
3784
+
3785
+ # # st.markdown('<div class="sidebar-footer">Powered by Hugging Face & Streamlit</div>', unsafe_allow_html=True)
3786
+
3787
+ # # If data is uploaded, process it
3788
+ # if uploaded_file is not None and ('df' not in st.session_state or st.session_state.get('df') is None):
3789
+ # try:
3790
+ # # Attempt to read the CSV file
3791
+ # df = pd.read_csv(uploaded_file)
3792
+
3793
+ # # Verify that we have a valid dataframe before storing in session state
3794
+ # if df is not None and not df.empty:
3795
+ # st.session_state['df'] = df
3796
+ # st.session_state['descriptive_stats'] = df.describe()
3797
+ # st.session_state['dataset_name'] = uploaded_file.name
3798
+ # st.success(f"Successfully loaded dataset: {uploaded_file.name}")
3799
+ # else:
3800
+ # st.error("The uploaded file appears to be empty.")
3801
+ # except Exception as e:
3802
+ # st.error(f"Error reading CSV file: {str(e)}")
3803
+
3804
+ # # Create navigation tabs using Streamlit
3805
+ # st.write("### Navigation")
3806
+ # tabs = ["Overview", "Distribution", "Relationships", "AI Insights", "Chat"]
3807
+
3808
+ # # Create columns for each tab
3809
+ # cols = st.columns(len(tabs))
3810
+
3811
+ # # Handle tab selection using Streamlit buttons
3812
+ # for i, tab in enumerate(tabs):
3813
+ # with cols[i]:
3814
+ # if st.button(tab, key=f"tab_{tab.lower()}"):
3815
+ # st.session_state['selected_tab'] = f"tab-{tab.lower().replace(' ', '-')}"
3816
+ # st.rerun()
3817
+
3818
+ # # Show selected tab indicator
3819
+ # selected_tab_name = st.session_state['selected_tab'].replace('tab-', '').replace('-', ' ').title()
3820
+ # st.markdown(f"<div style='text-align: center; margin-bottom: 2rem;'>Selected: {selected_tab_name}</div>", unsafe_allow_html=True)
3821
+
3822
+ # # Show welcome message if no data is uploaded
3823
+ # if 'df' not in st.session_state:
3824
+ # display_welcome_page()
3825
+ # else:
3826
+ # # Display content based on selected tab
3827
+ # if st.session_state['selected_tab'] == 'tab-overview':
3828
+ # display_descriptive_tab()
3829
+ # elif st.session_state['selected_tab'] == 'tab-distribution':
3830
+ # display_distribution_tab()
3831
+ # elif st.session_state['selected_tab'] == 'tab-relationships':
3832
+ # display_relationships_tab()
3833
+ # elif st.session_state['selected_tab'] == 'tab-ai-insights' or st.session_state['selected_tab'] == 'tab-ai':
3834
+ # display_ai_insights_tab()
3835
+ # elif st.session_state['selected_tab'] == 'tab-chat':
3836
+ # display_chat_interface()
3837
+
3838
+ # # After all tabs are rendered, check if we have a regenerate action
3839
+ # # This is processed at the end to avoid session state changes during rendering
3840
+ # if (st.session_state.get('loading_insights', False) and
3841
+ # ('ai_insights' not in st.session_state or st.session_state.get('ai_insights') is None)):
3842
+ # logger.info("Generating AI insights at end of main function")
3843
+ # try:
3844
+ # st.session_state['ai_insights'] = generate_ai_insights()
3845
+ # logger.info(f"Generated insights: {len(st.session_state['ai_insights'])} categories")
3846
+ # st.session_state['loading_insights'] = False
3847
+ # except Exception as e:
3848
+ # logger.error(f"Error generating insights in main function: {str(e)}")
3849
+ # st.session_state['loading_insights'] = False
3850
+ # st.session_state['ai_insights'] = {} # Set to empty dict to prevent repeated failures
3851
+ # finally:
3852
+ # st.rerun()
3853
+
3854
+ # if __name__ == "__main__":
3855
+ # main()