| | import streamlit as st |
| | from dotenv import load_dotenv |
| | from PyPDF2 import PdfReader |
| | from langchain.text_splitter import CharacterTextSplitter |
| | from langchain.embeddings import HuggingFaceEmbeddings |
| | from langchain.vectorstores import FAISS |
| | |
| | from langchain.memory import ConversationBufferMemory |
| | from langchain.chains import ConversationalRetrievalChain |
| | from htmlTemplates import css, bot_template, user_template |
| | from langchain.llms import HuggingFaceHub |
| | import os |
| | |
| | |
# Load variables from a local .env file BEFORE reading the token; the original
# only called load_dotenv() inside main(), which runs after this line, so a
# token supplied via .env was never picked up.
load_dotenv()

# Hugging Face Hub API token; raises KeyError (fail-fast) if it is missing
# from both the environment and .env.
hub_token = os.environ["HUGGINGFACE_HUB_TOKEN"]
| |
|
def split_pdfs(pdf_docs, pages_per_chunk=10):
    """Split PDF documents into chunks of at most ``pages_per_chunk`` pages.

    Args:
        pdf_docs: A list of PDF file paths or binary file-like objects that
            ``PyPDF2.PdfReader`` can open.
        pages_per_chunk: Maximum number of pages per chunk (default 10,
            matching the original hard-coded limit).

    Returns:
        A list of lists of page objects. Each inner list holds up to
        ``pages_per_chunk`` consecutive pages, pages from different
        documents are never mixed in one chunk, and no empty chunks are
        returned (the original could emit a trailing empty chunk whenever a
        document's page count was an exact multiple of 10, or for an empty
        PDF).
    """
    pdf_chunks = []
    for pdf_doc in pdf_docs:
        # Start a fresh chunk for every document so chunks never span files.
        current_chunk = []
        for page in PdfReader(pdf_doc).pages:
            current_chunk.append(page)
            if len(current_chunk) >= pages_per_chunk:
                pdf_chunks.append(current_chunk)
                current_chunk = []
        # Flush the final partial chunk only if it actually holds pages.
        if current_chunk:
            pdf_chunks.append(current_chunk)
    return pdf_chunks
| |
|
def _chunk_to_text(pdf_chunk):
    """Concatenate the extracted text of every page in a chunk.

    ``page.extract_text()`` may return None for image-only pages, so fall
    back to the empty string to keep the join safe.
    """
    return "\n".join(page.extract_text() or "" for page in pdf_chunk)


def generate_response(pdf_chunks, llm_model):
    """Generate an answer by summarizing PDF chunks and querying the LLM.

    The original body called ``get_pdf_text`` and ``get_text_chunks``, two
    names defined nowhere in this file, so it raised NameError on every
    call; they are replaced with ``_chunk_to_text`` and a plain join.

    Args:
        pdf_chunks: A list of lists of PDF page objects (see split_pdfs);
            each page must expose ``extract_text()``.
        llm_model: An LLM exposing ``generate(prompt=..., max_new_tokens=...)``.

    Returns:
        The LLM's response to the final, summary-based prompt.
    """
    # First pass: summarize each chunk independently (map step).
    pdf_summaries = []
    for pdf_chunk in pdf_chunks:
        pdf_summary = llm_model.generate(
            prompt=f"Summarize the following text:\n{_chunk_to_text(pdf_chunk)}",
            max_new_tokens=100,
        )
        pdf_summaries.append(pdf_summary)

    # Second pass: answer using all the summaries together (reduce step).
    # NOTE(review): the prompt ends at "Question:" with no question text
    # appended — presumably the user's question should be interpolated here;
    # preserved as-is since the original did the same.
    joined_summaries = "\n".join(str(s) for s in pdf_summaries)
    response = llm_model.generate(
        prompt=f"Answer the following question using the following summaries:\n{joined_summaries}\n\nQuestion:",
        max_new_tokens=200,
    )
    return response
| |
|
def main():
    """Run the Streamlit app: answer user questions about a bundled PDF."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # NOTE(review): mistralai/Mistral-7B-v0.1 is a base (non-instruct) model;
    # confirm an instruction-tuned variant is not intended here.
    llm_model = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-v0.1",
        huggingfacehub_api_token=hub_token,
    )

    # Initialize session state slots once per browser session.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")

    if user_question:
        # BUG FIX: the original passed ``Geeta.pdf`` — an attribute access on
        # the undefined name ``Geeta`` (NameError). split_pdfs expects a list
        # of PDF paths/file objects, so pass the filename as a one-item list.
        pdf_chunks = split_pdfs(["Geeta.pdf"])
        response = generate_response(pdf_chunks, llm_model)
        st.write(response)
| |
|
# Guard the entry point so importing this module does not launch the app.
if __name__ == "__main__":
    main()
| |
|