File size: 4,581 Bytes
7bed57b
45b1647
be9c947
45b1647
 
 
 
 
 
 
 
8aaa55b
 
 
 
 
45b1647
7bed57b
45b1647
 
 
7bed57b
45b1647
 
 
 
 
7bed57b
45b1647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db219ff
 
 
45b1647
7bed57b
1c056b2
45b1647
 
 
 
 
 
 
 
 
7bed57b
45b1647
7bed57b
45b1647
 
 
 
 
 
 
 
7bed57b
45b1647
7bed57b
 
 
 
 
45b1647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bed57b
45b1647
 
 
 
 
 
 
 
 
 
 
 
 
 
7bed57b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import tempfile
import hashlib
import streamlit as st
from langchain.llms import HuggingFaceHub
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from streamlit_pdf_viewer import pdf_viewer
import dotenv
dotenv.load_dotenv()
import os

# Hugging Face API token read from the "Token" environment variable (populated
# from a .env file by dotenv.load_dotenv() above, if one exists). os.getenv
# returns None when the variable is unset; main() passes the value to
# HuggingFaceHub as huggingfacehub_api_token.
token = os.getenv("Token")

def init_page() -> None:
    """Configure the Streamlit page and render the app heading.

    Sets the page title to "PDF Chatbot" and draws the subheader shown at
    the top of the main area.
    """
    st.set_page_config(page_title="PDF Chatbot")
    # The leading emoji was stored as mis-decoded UTF-8 bytes (mojibake);
    # it is restored to the intended speech-balloon character here.
    st.subheader("💬 PDF Chat with multi LLMs")

def init_messages() -> None:
    """Seed the chat history with the system prompt if none exists yet.

    Leaves ``st.session_state.messages`` untouched when it is already
    present, so the conversation survives Streamlit reruns.
    """
    if "messages" in st.session_state:
        return

    system_prompt = SystemMessage(
        content="You are a helpful AI assistant. Reply in markdown format."
    )
    st.session_state.messages = [system_prompt]

def _build_vectorstore(pdf_bytes):
    """Build a FAISS index from the raw bytes of a PDF.

    Args:
        pdf_bytes: The complete PDF file content as bytes.

    Returns:
        A FAISS vector store over the PDF's text chunks, or ``None`` when the
        PDF yields no text chunks (for example an image-only scan).
    """
    # PyPDFLoader takes a filesystem path, so the upload is spooled to a
    # temporary file. The file is closed before loading so that all bytes are
    # flushed to disk (and so it can be reopened on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(pdf_bytes)
        tmp_path = tmp.name
    try:
        pages = PyPDFLoader(tmp_path).load()
    finally:
        # delete=False means nothing else cleans this up; remove it ourselves
        # so repeated uploads do not accumulate temp files.
        os.unlink(tmp_path)

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(pages)
    if not chunks:
        # Building an index from zero documents fails; let the caller report it.
        return None
    return FAISS.from_documents(chunks, HuggingFaceEmbeddings())


def main() -> None:
    """Run the Streamlit PDF chat application.

    Renders the sidebar (model selection, PDF upload and preview), indexes the
    uploaded PDF into a FAISS vector store cached in ``st.session_state``, and
    answers chat questions with a ``RetrievalQA`` chain over that index. The
    index is rebuilt only when the uploaded file's content hash changes.
    """
    init_page()
    init_messages()

    # Initialize session state variables
    if 'vectorstore' not in st.session_state:
        st.session_state.vectorstore = None
    if 'current_file_hash' not in st.session_state:
        st.session_state.current_file_hash = None

    # Sidebar: LLM selection and PDF file uploader
    with st.sidebar:
        st.title("Options")
        selected_model = st.selectbox(
            "Select LLM",
            options=[
                "deepseek-ai/DeepSeek-V3",
                "Qwen/Qwen2.5-7B-Instruct",
                "meta-llama/Llama-3.1-8B-Instruct",
                "mistralai/Mistral-7B-Instruct-v0.3",
                "bigscience/bloom",
                "google/flan-t5-xxl"
            ],
            index=0,
            key="selected_model"
        )
        uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
        # Read the upload once and reuse the bytes for preview, hashing and
        # indexing.
        pdf_bytes = uploaded_file.getvalue() if uploaded_file else None
        if uploaded_file:
            pdf_viewer(input=pdf_bytes, width=300)

    # Initialize the LLM client with the module-level API token.
    llm = HuggingFaceHub(
        repo_id=selected_model,
        model_kwargs={"temperature": 0.5, "max_length": 500},
        huggingfacehub_api_token=token,
    )

    if not uploaded_file:
        st.write("Please upload a PDF file to start querying.")
        return

    # Compute file hash to check for changes (change detection only, not a
    # security use of MD5).
    file_hash = hashlib.md5(pdf_bytes).hexdigest()

    # Rebuild the index only for a new file or when no index is cached.
    if st.session_state.current_file_hash != file_hash or st.session_state.vectorstore is None:
        vectorstore = _build_vectorstore(pdf_bytes)
        if vectorstore is None:
            # Drop any index left over from a previous file so it is never
            # queried against the wrong document.
            st.session_state.vectorstore = None
            st.session_state.current_file_hash = None
            st.error("No extractable text was found in this PDF.")
            return
        st.session_state.vectorstore = vectorstore
        st.session_state.current_file_hash = file_hash

    # Chat interface
    if user_input := st.chat_input("Input your question about the PDF:"):
        st.session_state.messages.append(HumanMessage(content=user_input))
        with st.spinner("Analyzing ..."):
            try:
                qa_chain = RetrievalQA.from_chain_type(
                    llm=llm,
                    chain_type="stuff",
                    retriever=st.session_state.vectorstore.as_retriever()
                )
                answer = qa_chain.run(user_input)
            except Exception as e:
                # UI boundary: surface any failure in the chat instead of
                # crashing the app.
                answer = f"An error occurred: {str(e)}"
        st.session_state.messages.append(AIMessage(content=answer))

    # Display chat messages (the SystemMessage is intentionally not rendered).
    # Avatars are restored from mojibake to the intended emoji.
    for message in st.session_state.get("messages", []):
        if isinstance(message, AIMessage):
            with st.chat_message("assistant", avatar="👽"):
                st.markdown(message.content)
        elif isinstance(message, HumanMessage):
            with st.chat_message("user", avatar="🙋‍♂️"):
                st.markdown(message.content)

    # Clear conversation button
    if st.button("🧹 Clear Conversation", key="clear_chat"):
        # Reset through init_messages() so the system prompt is defined in
        # exactly one place.
        del st.session_state["messages"]
        init_messages()
        st.rerun()

# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()