File size: 2,237 Bytes
5311203
 
 
55743e6
5311203
55743e6
819112e
 
5311203
55743e6
 
5311203
819112e
 
 
7c2d6e5
55743e6
 
 
819112e
 
 
 
 
 
 
 
 
 
 
 
 
 
55743e6
819112e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import textwrap

# Sidebar controls: the source document and the retrieval chunk width.
st.sidebar.title("Upload your text file")
uploaded_file = st.sidebar.file_uploader("Choose a text file", type=["txt"])

# Chunk width in characters; textwrap.wrap below splits the text on this.
chunk_size = st.sidebar.slider(
    "Select chunk size",
    min_value=100,
    max_value=500,
    value=300,
    step=100,
)

if uploaded_file:
    # Rewind before reading: Streamlit reruns this whole script on every
    # widget interaction and keeps the same UploadedFile object (and its
    # read position) in memory, so without seek(0) every rerun after the
    # first would read b"" and crash the vectorizer below.
    uploaded_file.seek(0)
    text_data = uploaded_file.read().decode("utf-8")

    # Split the document into fixed-width character chunks; these are the
    # retrieval units the chatbot answers with.
    sentences = textwrap.wrap(text_data, chunk_size)

    # Guard: an empty/whitespace-only file yields no chunks, and
    # TfidfVectorizer.fit raises ValueError on an empty corpus.
    if not sentences:
        st.warning("The uploaded file is empty — please upload a file with text.")
        st.stop()

    # Fit TF-IDF on the chunks and keep the matrix sparse — no .toarray(),
    # so memory stays proportional to the number of non-zero terms.
    vectorizer = TfidfVectorizer().fit(sentences)
    vectors = vectorizer.transform(sentences)  # Keep it sparse

    def get_top_responses(user_query, top_n=5):
        """Return the top_n text chunks most similar to user_query.

        Similarity is cosine similarity between the TF-IDF vector of the
        query and the precomputed chunk vectors; results are ordered from
        most to least similar.
        """
        # Vectorize the query in the same TF-IDF space (stays sparse).
        query_vec = vectorizer.transform([user_query])

        # One similarity score per chunk, as a flat 1-D array.
        scores = cosine_similarity(query_vec, vectors).flatten()

        # argsort is ascending: take the last top_n indices, then reverse
        # so the best-matching chunk comes first.
        ranked = scores.argsort()[-top_n:][::-1]

        return [sentences[idx] for idx in ranked]

    # --- Chat interface ---
    st.title("TF-IDF Chatbot")

    # Conversation history must live in session_state to survive the
    # script rerun Streamlit performs on every interaction.
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    # Text box pinned to the bottom of the page.
    user_input = st.chat_input("Ask me anything")

    if user_input:
        # Record the question, then one bot message per retrieved chunk.
        st.session_state.messages.append({"role": "user", "content": user_input})
        for answer in get_top_responses(user_input):
            st.session_state.messages.append({"role": "bot", "content": answer})

    # Replay the entire conversation so far.
    for entry in st.session_state.messages:
        with st.chat_message(entry["role"]):
            st.write(entry["content"])