import io
import os

import streamlit as st
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceHub

# Candidate HuggingFace Hub models the user can pick from in the UI.
model_list = ['google/flan-t5-small', 'google/flan-t5-base', 'google/flan-t5-large']


def get_llm(model_name):
    """Build a HuggingFaceHub LLM for *model_name*.

    The MAX_LENGTH environment variable caps generation length.
    BUG FIX: a missing MAX_LENGTH used to raise TypeError inside int();
    a default of 512 is used instead.
    """
    return HuggingFaceHub(
        repo_id=model_name,
        model_kwargs={
            "temperature": 0.5,
            "max_length": int(os.environ.get("MAX_LENGTH", "512")),
        },
    )


# Prompt used to ground the LLM answer in the retrieved context.
template = """
Try to answer the Question based on the Context.
Context: {context}
Question: {question}
Answer:"""
prompt = PromptTemplate.from_template(template)

# Embedding model used both for indexing and for query-time retrieval.
# NOTE(review): EMB_MODEL is assumed to be set in the environment —
# HuggingFaceEmbeddings would otherwise fall back to its own default.
embedding_model = HuggingFaceEmbeddings(model_name=os.environ.get("EMB_MODEL"))


def read_pdf(files):
    """Extract and concatenate the text of every uploaded PDF.

    Pages with no extractable text are skipped. Returns "" (and logs)
    on any extraction failure — best-effort by design.
    """
    try:
        all_pdf_texts = ""
        for file_contents in files:
            reader = PdfReader(file_contents)
            pdf_texts = [p.extract_text().strip() for p in reader.pages]
            pdf_texts = [text for text in pdf_texts if text]
            all_pdf_texts += "\n\n".join(pdf_texts)
        return all_pdf_texts
    except Exception as e:
        print("Error faced in Read PDF -", e)
        return ""


def chunking(doc_text):
    """Split *doc_text* into overlapping chunks.

    Always returns a list of strings. Chunk size/overlap come from the
    CHUNK_SIZE / CHUNK_OVERLAP env vars, with defaults so a missing
    variable no longer raises TypeError in int().
    """
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(os.environ.get("CHUNK_SIZE", "1000")),
            chunk_overlap=int(os.environ.get("CHUNK_OVERLAP", "100")),
            length_function=len,
        )
        return text_splitter.split_text(doc_text)
    except Exception as e:
        print("Error faced in Chunking -", e)
        # BUG FIX: returning the bare string made the caller map it
        # char-by-char into Documents; wrap it in a list instead.
        return [doc_text]


def vectorize_text(pdf_chunks):
    """Embed *pdf_chunks* (list of strings) with the shared embedding model."""
    with st.spinner("Indexing into DB..."):
        return embedding_model.embed_documents(pdf_chunks)


def main():
    """Streamlit entry point: upload PDFs, index them, answer questions."""
    # ---- UI header -------------------------------------------------------
    st.set_page_config(page_title="inquiry")
    st.title("Document Inquiry Tool")
    st.caption("Document Inquiry Tool is designed to respond comprehensively to questions posed about the provided document, regardless of the section from which the questions originate.")
    st.subheader("Step 1 - Upload the Document")

    # File uploader (always returns a list when accept_multiple_files=True).
    uploaded_files = st.file_uploader("Choose a file", type=["pdf"], accept_multiple_files=True)

    # True when the uploaded file set changed, forcing re-chunk + re-index.
    rerun_switch = False

    # ---- Session-state initialisation -----------------------------------
    if "ip_files" not in st.session_state:
        st.session_state.ip_files = []
        st.session_state.pdf_texts = ""
    if 'db' not in st.session_state:
        st.session_state.db = None
    if "pdf_chunks" not in st.session_state:
        st.session_state.pdf_chunks = []

    if uploaded_files:
        # ---- Read ---------------------------------------------------
        with st.spinner("Reading the file..."):
            if st.session_state.ip_files != uploaded_files:
                st.session_state.ip_files = uploaded_files
                st.session_state.pdf_texts = read_pdf(uploaded_files)
                rerun_switch = True  # reindex with all new files

        # Collapsible preview of the extracted document text.
        with st.expander("click here to see the document content", expanded=False):
            st.text_area("Document Content Preview", st.session_state.pdf_texts, height=400)

        # ---- Chunk --------------------------------------------------
        with st.spinner("Chunking..."):
            if not st.session_state.pdf_chunks or rerun_switch:
                chunks = chunking(st.session_state.pdf_texts)
                st.session_state.pdf_chunks = [Document(page_content=c) for c in chunks]

        # ---- Index --------------------------------------------------
        with st.spinner("Indexing into DB..."):
            if st.session_state.db is None or rerun_switch:
                st.session_state.db = FAISS.from_documents(st.session_state.pdf_chunks, embedding_model)

        # ---- Query --------------------------------------------------
        st.subheader("Step 2 - Ask a Question")
        model_name = st.selectbox("Select the LLM model:", model_list)
        llm = get_llm(model_name)
        llm_chain = LLMChain(prompt=prompt, llm=llm)
        user_query = st.text_area("Type your question here", height=100)

        # BUG FIX: fetch_k alone does not limit the number of results
        # (it only matters together with a filter); k=5 is what was meant.
        topn = st.session_state.db.similarity_search(user_query, k=5)

        if st.button("Find Answer"):
            with st.spinner("Generating..."):
                st.success(llm_chain.run({"question": user_query, "context": topn}))
    else:
        # No files uploaded: reset the vector DB and cached chunks.
        if st.session_state.db is not None:
            ids = []
            for i in range(len(st.session_state.pdf_chunks)):
                # .get() replaces the original bare `except: break`;
                # a missing index means there is nothing more to collect.
                doc_id = st.session_state.db.index_to_docstore_id.get(i)
                if doc_id is None:
                    break
                ids.append(doc_id)
            try:
                st.session_state.db.delete(ids)
            except Exception as e:
                # Best-effort cleanup; log rather than silently swallow.
                print("Error faced in DB reset -", e)
        st.session_state.pdf_chunks = []
        st.session_state.db = None


if __name__ == '__main__':
    main()