Spaces:

1MR
/

ARAG

Sleeping

App Files Files Community

1MR committed on Dec 22, 2024

Commit

27b7701

verified ·

1 Parent(s): 55ed2e6

Upload 8 files

Browse files

Files changed (8) hide show

Information.py +61 -0
Main.py +122 -0
Preprocessing1.py +145 -0
Preprocessing2.py +217 -0
RAG.py +222 -0
Rag.txt +28 -0
Virtualization.py +75 -0
tempCodeRunnerFile.py +1 -0

Information.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import io
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder
+import seaborn as sns
+import base64
+def show_general_data_statistics():
+    if "data" in st.session_state:
+        data = st.session_state["data"]
+        num_var = len(data.columns)
+        num_rows = len(data)
+        missing_cells = data.isnull().sum().sum()
+        missing_cells_percent = (missing_cells / (data.size)) * 100
+        duplicate_rows = data.duplicated().sum()
+        duplicate_rows_percent = (duplicate_rows / num_rows) * 100
+        var_types = data.dtypes.value_counts()
+        st.write("### General Data Statistics:")
+        st.write(f"- **Number of Variables:**   {num_var}")
+        st.write(f"- **Number of Rows:**    {num_rows}")
+        st.write(f"- **Missing Cells:**     {missing_cells}")
+        st.write(f"- **Missing Cells (%):**     {missing_cells_percent:.2f}%")
+        st.write(f"- **Duplicate Rows:**    {duplicate_rows}")
+        st.write(f"- **Duplicate Rows (%):**    {duplicate_rows_percent:.2f}%")
+        st.write("#### Variable Types:")
+        st.write(var_types)
+    else:
+        st.warning("Please upload a dataset first.")
+def describe_data():
+    st.title("Describe Data")
+    if "data" in st.session_state:
+        data = st.session_state["data"]
+        st.write("Dataset Description:")
+        st.write(data.describe())
+    else:
+        st.warning("Please upload a dataset first.")
+def info_data():
+    st.title("Dataset Info")
+    if "data" in st.session_state:
+        data = st.session_state["data"]
+        buffer = io.StringIO()
+        data.info(buf=buffer)
+        info = buffer.getvalue()
+        st.text(info)
+    else:
+        st.warning("Please upload a dataset first.")

Main.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import io
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder
+import seaborn as sns
+import base64
+import json
+from langchain.docstore.document import Document
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.llms import HuggingFaceHub
+from langchain.chains import RetrievalQA
+from Information import show_general_data_statistics, describe_data, info_data
+from Preprocessing1 import preview_data, data_cleaning, modify_column_names
+from Preprocessing2 import handle_categorical_values, missing_values, handle_duplicates, handle_outliers
+from Virtualization import visualize_data
+def upload_data():
+    st.title("Upload Dataset")
+    file = st.file_uploader("Upload your dataset", type=[
+                            "csv", "xlsx"], key="file_uploader_1")
+    if file:
+        try:
+            if file.name.endswith(".csv"):
+                data = pd.read_csv(file)
+            elif file.name.endswith(".xlsx"):
+                data = pd.read_excel(file)
+            st.session_state["data"] = data
+            st.success("Dataset uploaded successfully!")
+        except Exception as e:
+            st.error(f"Error loading file: {e}")
+    return file
+def download_data():
+    """Downloads the DataFrame as a CSV file."""
+    if "data" in st.session_state and not st.session_state["data"].empty:
+        csv = st.session_state["data"].to_csv(index=False).encode('utf-8')
+        download_button = st.download_button(
+            label="Download Cleaned Dataset",
+            data=csv,
+            file_name="cleaned_data.csv",
+            mime="text/csv"
+        )
+        if download_button:
+            st.balloons()
+            st.success("Dataset is ready for download!")
+    else:
+        st.warning(
+            "No data available to download. Please modify or upload a dataset first.")
+def rag_chatbot():  # Placeholder handler for the "RAG Chatbot" navigation entry dispatched from main().
+    pass  # No-op: selecting this page currently renders nothing.
+def main():
+    st.sidebar.title("Navigation")
+    options = st.sidebar.radio(
+        "Go to",
+        [
+            "Upload",
+            "Preview",
+            "Data Cleaning",
+            "Modify Column Names",
+            "General Data Statistics",
+            "Describe",
+            "Info",
+            "Handle Categorical",
+            "Missing Values",
+            "Handle Duplicates",
+            "Handle Outliers",
+            "Visualize Data",
+            "Download",
+            "RAG Chatbot"
+        ],
+        key="unique_navigation_key",
+    )
+    if options == "Upload":
+        upload_data()
+    elif options == "Preview":
+        preview_data()
+    elif options == "Data Cleaning":
+        data_cleaning()
+    elif options == "Modify Column Names":
+        modify_column_names()
+    elif options == "General Data Statistics":
+        show_general_data_statistics()
+    elif options == "Describe":
+        describe_data()
+    elif options == "Info":
+        info_data()
+    elif options == "Handle Categorical":
+        handle_categorical_values()
+    elif options == "Missing Values":
+        missing_values()
+    elif options == "Handle Duplicates":
+        handle_duplicates()
+    elif options == "Handle Outliers":
+        handle_outliers()
+    elif options == "Visualize Data":
+        visualize_data()
+    elif options == "Download":
+        download_data()
+    elif options == "RAG Chatbot":
+        rag_chatbot()
+    else:
+        st.warning("Please upload a dataset first.")
+if __name__ == "__main__":  # Start the app only when this file is run as a script, not when imported.
+    main()

Preprocessing1.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import io
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder
+import seaborn as sns
+import base64
+def preview_data():
+    if "data" in st.session_state:
+        data = st.session_state["data"]
+        st.write("### Dataset Preview Options:")
+        preview_option = st.radio(
+            "Select how to preview the dataset:",
+            options=["Head", "Tail", "Custom Number of Rows"],
+            index=0
+        )
+        if preview_option == "Head":
+            st.write("### First 5 Rows of the Dataset:")
+            st.dataframe(data.head())
+        elif preview_option == "Tail":
+            st.write("### Last 5 Rows of the Dataset:")
+            st.dataframe(data.tail())
+        elif preview_option == "Custom Number of Rows":
+            number = st.slider(
+                "Select Number of Rows to Display:", 1, len(data))
+            st.write(f"### First {number} Rows of the Dataset:")
+            st.dataframe(data.head(number))
+        # Show entire data
+        if st.checkbox("Show all data"):
+            st.write(data)
+        # Show column names
+        if st.checkbox("Show Column Names"):
+            st.write(data.columns)
+        # Show dataset dimensions (rows and columns)
+        if st.checkbox("Show Dimensions"):
+            st.write(data.shape)
+    else:
+        st.warning("Please upload a dataset to view options.")
+def data_cleaning():
+    if "data" in st.session_state:
+        data = st.session_state["data"]
+        st.subheader("Data Cleaning")
+        col_option = st.selectbox("Choose your option", [
+                                  "Check all numeric features are numeric?", "Show unique values of categorical features"])
+        # Check and convert numeric columns
+        if col_option == "Check all numeric features are numeric?":
+            st.write("Converting all numeric columns to numeric types...")
+            numeric_columns = list(
+                data.select_dtypes(include=np.number).columns)
+            for col in numeric_columns:
+                data[col] = pd.to_numeric(data[col], errors='coerce')
+            st.success("Done!")
+        # Show unique values for categorical features
+        elif col_option == "Show unique values of categorical features":
+            st.write("Unique values for categorical features:")
+            for column in data.columns:
+                # check for categorical features (strings)
+                if data[column].dtype == object:
+                    st.write(f"{column}: {data[column].unique()}")
+            st.write("====================================")
+    else:
+        st.warning("Please upload a dataset to perform data cleaning.")
+def modify_column_names():
+    st.title("Modify Column Names")
+    # Ensure data exists in the session
+    if "data" in st.session_state:
+        df = st.session_state["data"]
+        # Ensure modified_columns is initialized in session state
+        if "modified_columns" not in st.session_state:
+            st.session_state.modified_columns = list(df.columns)
+        st.write('### *Current Column Names*')
+        st.table(df.columns)
+        st.write('### *Modify Column Names*')
+        with st.expander("Modify Column Names", expanded=True):
+            # Use the modified columns from session state
+            before_col = st.session_state.modified_columns
+            before_col_df = pd.DataFrame(before_col, columns=['Column Name'])
+            st.table(before_col_df)
+            col3, col4, col5, col6 = st.columns(4)
+            changes_made = False  # Flag to track if any change is made
+            if st.button('Convert to Uppercase'):
+                st.session_state.modified_columns = [
+                    col.upper() for col in before_col]
+                changes_made = True
+            if st.button('Convert to Lowercase'):
+                st.session_state.modified_columns = [
+                    col.lower() for col in before_col]
+                changes_made = True
+            if st.button('Replace Spaces with Underscore'):
+                st.session_state.modified_columns = [
+                    col.replace(" ", "_") for col in before_col]
+                changes_made = True
+            if st.button('Capitalize First Letters'):
+                st.session_state.modified_columns = [
+                    col.title() for col in before_col]
+                changes_made = True
+            # Apply the changes only if a change was made
+            if changes_made:
+                df.columns = st.session_state.modified_columns
+                st.success("Changes applied successfully.")
+                st.table(pd.DataFrame(
+                    df.columns, columns=['Modified Columns']))
+        st.write("### *Modify a Specific Column Name*")
+        column_select = st.selectbox(
+            'Select column to modify', options=st.session_state.modified_columns)
+        new_column_name = st.text_input('Enter new column name')
+        if st.button('Update Column Name'):
+            if column_select and new_column_name:
+                st.session_state.modified_columns = [
+                    new_column_name if col == column_select else col for col in st.session_state.modified_columns]
+                df.columns = st.session_state.modified_columns
+                st.success("Column name updated.")
+                st.table(pd.DataFrame(
+                    df.columns, columns=['Modified Columns']))
+    else:
+        st.warning("Please upload a dataset first.")

Preprocessing2.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import io
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder
+import seaborn as sns
+import base64
+def handle_categorical_values():
+    if "data" in st.session_state:
+        data = st.session_state["data"]
+        st.subheader("Handle Categorical Values")
+        categorical_cols_features = list(
+            data.select_dtypes(include="object").columns)
+        # One-Hot Encoding for nominal categorical features
+        one_hot_enc = st.multiselect(
+            "Select nominal categorical columns", categorical_cols_features)
+        # Apply one-hot encoding to selected columns
+        if one_hot_enc:
+            for column in one_hot_enc:
+                if data[column].dtype == 'object':  # Only apply to categorical/string columns
+                    data = pd.get_dummies(data, columns=[column])
+            st.write("### Data after One-Hot Encoding:")
+            st.write(data.head())
+        # Label Encoding for ordinal categorical features
+        label_encoder = LabelEncoder()
+        label_enc = st.multiselect(
+            "Select ordinal categorical columns", categorical_cols_features)
+        # Apply label encoding to selected columns
+        if label_enc:
+            for column in label_enc:
+                if data[column].dtype == 'object':  # Only apply to categorical/string columns
+                    data[column] = label_encoder.fit_transform(data[column])
+            st.write("### Data after Label Encoding:")
+            st.write(data.head())
+    else:
+        st.warning("Please upload a dataset to handle categorical values.")
+def missing_values():
+    st.title("Handle Missing Values")
+    if "data" in st.session_state:
+        data = st.session_state["data"].copy()
+        action = st.selectbox(
+            "Select Action", ["Drop", "Dropna", "Fill missing val"])
+        column = st.selectbox("Select Column", data.columns)
+        st.write("### Before:")
+        st.dataframe(data)
+        modified_data = data.copy()
+        if action == "Drop":
+            modified_data.drop(columns=[column], inplace=True)
+        elif action == "Dropna":
+            modified_data.dropna(subset=[column], inplace=True)
+        elif action == "Fill missing val":
+            fill_method = st.selectbox(
+                "Select fill method", ["Mean", "Mode", "Median"])
+            if fill_method == "Mean":
+                fill_value = data[column].mean()
+            elif fill_method == "Mode":
+                fill_value = data[column].mode()[0]
+            elif fill_method == "Median":
+                fill_value = data[column].median()
+            modified_data[column].fillna(fill_value, inplace=True)
+        st.write("### After (Preview):")
+        st.dataframe(modified_data)
+        if st.button("OK"):
+            st.session_state["data"] = modified_data
+            st.success("Done! The action has been applied.")
+            st.write("### After:")
+            st.dataframe(modified_data)
+    else:
+        st.warning("Please upload a dataset first.")
+def handle_duplicates():
+    st.title("Handle Duplicates")
+    if "data" in st.session_state:
+        data = st.session_state["data"].copy()
+        action = st.selectbox(
+            "Select Action", ["Drop Duplicates", "Drop Duplicates in Column", "Keep First", "Keep Last"])
+        if action in ["Drop Duplicates in Column", "Keep First", "Keep Last"]:
+            column = st.selectbox("Select Column", data.columns)
+        else:
+            column = None
+        st.write("### Before:")
+        st.dataframe(data)
+        after_placeholder = st.empty()
+        modified_data = data.copy()
+        if action == "Drop Duplicates":
+            modified_data.drop_duplicates(inplace=True)
+        elif action == "Drop Duplicates in Column":
+            modified_data.drop_duplicates(subset=[column], inplace=True)
+        elif action == "Keep First":
+            modified_data.drop_duplicates(
+                subset=[column], keep="first", inplace=True)
+        elif action == "Keep Last":
+            modified_data.drop_duplicates(
+                subset=[column], keep="last", inplace=True)
+        st.write("### After (Preview):")
+        st.dataframe(modified_data)
+        if st.button("OK"):
+            st.session_state["data"] = modified_data
+            st.success("Done! The action has been applied.")
+            st.write("### After:")
+            st.dataframe(modified_data)
+    else:
+        st.warning("Please upload a dataset first.")
+def handle_outliers():
+    st.title("Handle Outliers")
+    if "data" in st.session_state:
+        data = st.session_state["data"].copy()
+        column = st.selectbox("Select Column", data.select_dtypes(
+            include=[np.number]).columns)
+        action = st.selectbox(
+            "Select Action",
+            ["Remove Outliers (IQR)", "Set Bounds Manually",
+             "Replace Outliers"]
+        )
+        st.write("### Before:")
+        st.dataframe(data)
+        after_placeholder = st.empty()
+        modified_data = data.copy()
+        if action == "Remove Outliers (IQR)":
+            Q1 = data[column].quantile(0.25)
+            Q3 = data[column].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+            # Remove outliers
+            modified_data = modified_data[(
+                modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
+        elif action == "Set Bounds Manually":
+            # User inputs for bounds
+            lower_bound = st.number_input(
+                f"Set lower bound for {column}", value=float(data[column].min()))
+            upper_bound = st.number_input(
+                f"Set upper bound for {column}", value=float(data[column].max()))
+            modified_data = modified_data[(
+                modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
+        elif action == "Replace Outliers":
+            Q1 = data[column].quantile(0.25)
+            Q3 = data[column].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+            replace_method = st.radio(
+                "Select Replacement Method",
+                ["Mean", "Median"]
+            )
+            if replace_method == "Mean":
+                replacement_value = data[column].mean()
+            else:
+                replacement_value = data[column].median()
+            # Replace outliers
+            modified_data[column] = modified_data[column].apply(
+                lambda x: replacement_value if x < lower_bound or x > upper_bound else x
+            )
+        # After Visualization
+        st.write("### After (Preview):")
+        st.dataframe(modified_data)
+        if st.button("OK"):
+            st.session_state["data"] = modified_data
+            st.success("Done! The action has been applied.")
+            st.write("### After:")
+            st.dataframe(modified_data)
+    else:
+        st.warning("Please upload a dataset first.")

RAG.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import html
+import streamlit as st
+import pandas as pd
+from langchain.document_loaders import DirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.llms import Ollama
+from langchain.vectorstores import FAISS
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from sentence_transformers import SentenceTransformer, util
+from langchain.schema import Document
+from langchain_core.chat_history import BaseChatMessageHistory
+from langchain.chains import create_history_aware_retriever
+from langchain_huggingface import HuggingFaceEmbeddings
+bot_template = '''
+<div style="display: flex; align-items: center; margin-bottom: 10px; background-color: #B22222; padding: 10px; border-radius: 10px; border: 1px solid #7A0000;">
+    <div style="flex-shrink: 0; margin-right: 10px;">
+        <img src="https://raw.githubusercontent.com/AalaaAyman24/Test/main/chatbot.png"
+             style="max-height: 50px; max-width: 50px; object-fit: cover;">
+    </div>
+    <div style="background-color: #B22222; color: white; padding: 10px; border-radius: 10px; max-width: 75%; word-wrap: break-word; overflow-wrap: break-word;">
+        {msg}
+    </div>
+</div>
+'''  # bot_template: HTML bubble for a bot reply (avatar image, then the text); "{msg}" is filled in with str.format.
+user_template = '''
+<div style="display: flex; align-items: center; margin-bottom: 10px; justify-content: flex-end;">
+    <div style="flex-shrink: 0; margin-left: 10px;">
+        <img src="https://raw.githubusercontent.com/AalaaAyman24/Test/main/question.png"
+             style="max-height: 50px; max-width: 50px; border-radius: 50%; object-fit: cover;">
+    </div>
+    <div style="background-color: #757882; color: white; padding: 10px; border-radius: 10px; max-width: 75%; word-wrap: break-word; overflow-wrap: break-word;">
+        {msg}
+    </div>
+</div>
+'''  # user_template: HTML bubble for a user message in a right-aligned row; "{msg}" is filled in with str.format.
+button_style = """
+<style>
+    .small-button {
+        display: inline-block;
+        padding: 5px 10px;
+        font-size: 12px;
+        color: white;
+        background-color: #007bff;
+        border: none;
+        border-radius: 5px;
+        cursor: pointer;
+        margin-right: 5px;
+    }
+    .small-button:hover {
+        background-color: #0056b3;
+    }
+    .chat-box {
+        position: fixed;
+        bottom: 20px;
+        width: 100%;
+        left: 0;
+        padding: 20px;
+        background-color: #f1f1f1;
+        border-radius: 10px;
+        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+    }
+</style>
+"""  # button_style: CSS for the .small-button and .chat-box classes, injected via st.markdown with unsafe_allow_html=True.
+# Function to prepare and split documents from CSV or Excel
+def prepare_and_split_docs(files):
+    split_docs = []
+    for file in files:
+        # Read the file with pandas based on the extension
+        if file.name.endswith('.csv'):
+            df = pd.read_csv(file)
+        elif file.name.endswith('.xlsx'):
+            df = pd.read_excel(file)
+        # Convert dataframe to text for document splitting (this could vary based on the structure of the data)
+        # Convert the whole dataframe to string without index
+        text = df.to_string(index=False)
+        # Wrap the string into a Document object
+        document = Document(page_content=text, metadata={"source": file.name})
+        # Create the splitter and split the document
+        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+            chunk_size=512,
+            chunk_overlap=256,
+            disallowed_special=(),
+            separators=["\n\n", "\n", " "]
+        )
+        split_docs.extend(splitter.split_documents([document]))
+    return split_docs
+# Function to ingest documents into the vector database
+def ingest_into_vectordb(split_docs):
+    embeddings = HuggingFaceEmbeddings(
+        model_name='sentence-transformers/all-MiniLM-L6-v2')
+    db = FAISS.from_documents(split_docs, embeddings)
+    DB_FAISS_PATH = 'vectorstore/db_faiss'
+    db.save_local(DB_FAISS_PATH)
+    return db
+# Function to get the conversation chain
+def get_conversation_chain(retriever):
+    llm = Ollama(model="llama3.2:1b")
+    contextualize_q_system_prompt = (
+        "Given the chat history and the latest user question, "
+        "provide a response that directly addresses the user's query based on the provided documents. "
+        "Do not rephrase the question or ask follow-up questions."
+    )
+    contextualize_q_prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", contextualize_q_system_prompt),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}"),
+        ]
+    )
+    history_aware_retriever = create_history_aware_retriever(
+        llm, retriever, contextualize_q_prompt
+    )
+    system_prompt = (
+        "As a personal chat assistant, provide accurate and relevant information based on the provided document in 2-3 sentences. "
+        "Answer should be limited to 50 words and 2-3 sentences. Do not prompt to select answers or formulate a stand-alone question."
+        "{context}"
+    )
+    qa_prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", system_prompt),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}"),
+        ]
+    )
+    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+    rag_chain = create_retrieval_chain(
+        history_aware_retriever, question_answer_chain)
+    store = {}
+    def get_session_history(session_id: str) -> BaseChatMessageHistory:
+        if session_id not in store:
+            store[session_id] = ChatMessageHistory()
+        return store[session_id]
+    conversational_rag_chain = RunnableWithMessageHistory(
+        rag_chain,
+        get_session_history,
+        input_messages_key="input",
+        history_messages_key="chat_history",
+        output_messages_key="answer",
+    )
+    return conversational_rag_chain
+def calculate_similarity_score(answer: str, context_docs: list) -> float:
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    context_docs = [doc.page_content for doc in context_docs]
+    answer_embedding = model.encode(answer, convert_to_tensor=True)
+    context_embeddings = model.encode(context_docs, convert_to_tensor=True)
+    similarities = util.pytorch_cos_sim(answer_embedding, context_embeddings)
+    max_score = similarities.max().item()
+    return max_score
+st.title("What can I help with⁉️")
+# Sidebar for file upload
+uploaded_files = st.sidebar.file_uploader(
+    "Upload CSV/Excel Documents", type=["csv", "xlsx"], accept_multiple_files=True)
+if uploaded_files:
+    if st.sidebar.button("Process Documents"):
+        split_docs = prepare_and_split_docs(uploaded_files)
+        vector_db = ingest_into_vectordb(split_docs)
+        retriever = vector_db.as_retriever()
+        st.sidebar.success("Documents processed and vector database created!")
+        # Initialize the conversation chain
+        conversational_chain = get_conversation_chain(retriever)
+        st.session_state.conversational_chain = conversational_chain
+if 'chat_history' not in st.session_state:
+    st.session_state.chat_history = []
+# Chat input
+st.markdown(button_style, unsafe_allow_html=True)
+user_input = st.text_input("Ask a question about the dataset:",
+                           key="user_input", placeholder="Type your question here...")
+if st.button("Submit"):
+    st.markdown(button_style, unsafe_allow_html=True)
+    if user_input and 'conversational_chain' in st.session_state:
+        session_id = "abc123"
+        conversational_chain = st.session_state.conversational_chain
+        response = conversational_chain.invoke({"input": user_input}, config={
+                                               "configurable": {"session_id": session_id}})
+        context_docs = response.get('context', [])
+        st.session_state.chat_history.append(
+            {"user": user_input, "bot": response['answer'], "context_docs": context_docs})
+# Display chat history
+if st.session_state.chat_history:
+    for message in st.session_state.chat_history:
+        st.markdown(user_template.format(
+            msg=message['user']), unsafe_allow_html=True)
+        st.markdown(bot_template.format(
+            msg=message['bot']), unsafe_allow_html=True)

Rag.txt ADDED Viewed

	@@ -0,0 +1,28 @@

+def rag_chatbot():
+    st.title("RAG Chatbot")
+    # Check if data is uploaded
+    if "data" in st.session_state and isinstance(st.session_state["data"], pd.DataFrame):
+        df = st.session_state["data"]
+        # Convert data to documents
+        st.write("Processing the dataset...")
+        documents = create_doucment(df)
+        st.write(f"Created {len(documents)} documents.")
+        # Load models
+        st.write("Loading models...")
+        embedding = load_models_embedding()
+        llm = load_models_llm()
+        # Create retriever
+        retriever = create_database(embedding, documents).as_retriever()
+        # Ask a question
+        question = st.text_input("Ask a question about your dataset:")
+        if question:
+            response = ask_me(question, retriever, llm)
+            st.write(f"Answer: {response}")
+    else:
+        st.warning("Please upload a dataset to proceed.")

Virtualization.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import plotly.express as px
+def visualize_data():
+    st.title("Data Visualization")
+    if "data" in st.session_state:
+        df = st.session_state["data"]
+        chart_type = st.selectbox("Choose Chart Type", [
+                                  "Bar Chart", "Histogram", "Boxplot", "Doughnut Chart", "Pie Chart"])
+        columns = df.select_dtypes(include=['number']).columns.tolist()
+        selected_column = st.selectbox("Select Column", columns)
+        value_counts = df[selected_column].value_counts()
+        if chart_type == "Bar Chart":
+            if len(value_counts) > 20:
+                st.warning(
+                    "Bar Chart is not suitable for more than 20 unique values. Please select a column with 20 or fewer unique values.")
+            else:
+                st.subheader(f"Bar Chart for {selected_column}")
+                fig, ax = plt.subplots()
+                df[selected_column].value_counts().plot(kind='bar', ax=ax)
+                st.pyplot(fig)
+        elif chart_type == "Histogram":
+            if len(value_counts) < 10:
+                st.warning(
+                    "Histogram requires at least 10 unique values to be meaningful. Please select a column with more than 10 unique values.")
+            else:
+                st.subheader(f"Histogram for {selected_column}")
+                fig, ax = plt.subplots()
+                ax.hist(df[selected_column], bins=20, edgecolor="black")
+                ax.set_xlabel(selected_column)
+                ax.set_ylabel('Frequency')
+                st.pyplot(fig)
+        elif chart_type == "Boxplot":
+            if len(value_counts) < 5:
+                st.warning(
+                    "Boxplot requires at least 5 unique values to show distribution. Please select a column with more than 5 unique values.")
+            else:
+                st.subheader(f"Boxplot for {selected_column}")
+                fig = plt.figure(figsize=(6, 4))
+                sns.boxplot(x=df[selected_column])
+                st.pyplot(fig)
+        elif chart_type == "Doughnut Chart":
+            if len(value_counts) > 5:
+                st.warning(
+                    "Doughnut Chart is not suitable for more than 5 unique values. Please select a column with 5 or fewer unique values.")
+            else:
+                st.subheader(f"Doughnut Chart for {selected_column}")
+                fig = px.pie(value_counts, names=value_counts.index,
+                             values=value_counts.values, hole=0.3)
+                st.plotly_chart(fig)
+        elif chart_type == "Pie Chart":
+            if len(value_counts) > 5:
+                st.warning(
+                    "Pie Chart is not suitable for more than 5 unique values. Please select a column with 5 or fewer unique values.")
+            else:
+                st.subheader(f"Pie Chart for {selected_column}")
+                fig = px.pie(value_counts, names=value_counts.index,
+                             values=value_counts.values)
+                st.plotly_chart(fig)
+    else:
+        st.warning("Please upload a dataset first.")

tempCodeRunnerFile.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ plotly.express