"""Streamlit TF-IDF chatbot.

Lets the user upload a plain-text file, splits it into fixed-size chunks,
indexes the chunks with TF-IDF, and answers each chat query with the
chunks most cosine-similar to the query.
"""

import textwrap

import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Sidebar: file upload and chunk-size control ------------------------
st.sidebar.title("Upload your text file")
uploaded_file = st.sidebar.file_uploader("Choose a text file", type=["txt"])

# Chunk size controls how much context each retrieved passage carries.
chunk_size = st.sidebar.slider(
    "Select chunk size", min_value=100, max_value=500, step=100, value=300
)

st.title("TF-IDF Chatbot")

# Chat history survives Streamlit reruns via session state.
if "messages" not in st.session_state:
    st.session_state.messages = []

if uploaded_file:
    # Decode the upload and cut it into chunks of at most `chunk_size`
    # characters (textwrap.wrap breaks on whitespace, so words stay whole).
    text_data = uploaded_file.read().decode("utf-8")
    sentences = textwrap.wrap(text_data, chunk_size)

    if sentences:
        # Fit the vectorizer on the chunks; keep the document-term matrix
        # sparse so large files do not blow up memory.
        # NOTE: guarded by the `sentences` check above — fitting on an
        # empty list raises ValueError ("empty vocabulary").
        vectorizer = TfidfVectorizer().fit(sentences)
        vectors = vectorizer.transform(sentences)

        def get_top_responses(user_query, top_n=5):
            """Return the `top_n` chunks most similar to `user_query`.

            Args:
                user_query: Free-text question from the chat box.
                top_n: Number of chunks to return, most similar first.

            Returns:
                List of up to `top_n` text chunks ordered by descending
                cosine similarity to the query.
            """
            # Transform the query into the same TF-IDF space; stays sparse.
            user_vector = vectorizer.transform([user_query])
            similarities = cosine_similarity(user_vector, vectors).flatten()
            # argsort is ascending: take the last top_n indices and
            # reverse so the best match comes first.
            top_indices = similarities.argsort()[-top_n:][::-1]
            return [sentences[i] for i in top_indices]

        # --- Chat input: only reachable once an index exists, so
        # get_top_responses is guaranteed to be defined here.
        user_input = st.chat_input("Ask me anything")
        if user_input:
            st.session_state.messages.append(
                {"role": "user", "content": user_input}
            )
            for response in get_top_responses(user_input):
                st.session_state.messages.append(
                    {"role": "bot", "content": response}
                )
    else:
        # An empty file would crash TfidfVectorizer; tell the user instead.
        st.warning("The uploaded file is empty — nothing to index.")
else:
    # No file yet: prompt the user rather than letting a chat submission
    # hit an undefined get_top_responses.
    st.info("Upload a text file in the sidebar to start chatting.")

# Render the full conversation so far (user queries and bot responses).
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])