Spaces:
Build error
Build error
| import streamlit as st | |
| from dotenv import load_dotenv | |
| import os | |
| import traceback | |
| # PDF and NLP Libraries | |
| import PyPDF2 | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from sentence_transformers import SentenceTransformer, util | |
| # Embedding and Vector Store | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.vectorstores import FAISS | |
| # LLM and Conversational Chain | |
| from langchain_groq import ChatGroq | |
| from langchain.memory import ConversationBufferMemory | |
| from langchain.chains import ConversationalRetrievalChain | |
| from langchain.prompts import PromptTemplate | |
| # Custom Templates | |
| from htmlTemplate import css, bot_template, user_template | |
| # Load environment variables | |
| os.environ["GROQ_API_KEY"]= os.getenv('GROQ_API_KEY') | |
| # LLM Template for focused responses | |
| llmtemplate = """You're an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity. | |
| Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address: | |
| {question} | |
| Keep in mind the following instructions: | |
| - Your response should be direct and factual, limited to 50 words and 2-3 sentences. | |
| - Avoid using introductory phrases like "yes" or "no." | |
| - Maintain an ethical and unbiased tone, steering clear of harmful or offensive content. | |
| - If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document." | |
| - Do not fabricate information, include questions, or use confirmatory phrases. | |
| - Remember not to prompt for additional information or ask any questions. | |
| Ensure your response is strictly based on the content of the markdown document. | |
| """ | |
| def prepare_docs(pdf_docs): | |
| """Extract text from uploaded PDF documents""" | |
| docs = [] | |
| metadata = [] | |
| content = [] | |
| for pdf in pdf_docs: | |
| pdf_reader = PyPDF2.PdfReader(pdf) | |
| for index, text in enumerate(pdf_reader.pages): | |
| doc_page = { | |
| 'title': f"{pdf.name} page {index + 1}", | |
| 'content': pdf_reader.pages[index].extract_text() | |
| } | |
| docs.append(doc_page) | |
| for doc in docs: | |
| content.append(doc["content"]) | |
| metadata.append({"title": doc["title"]}) | |
| return content, metadata | |
| def get_text_chunks(content, metadata): | |
| """Split documents into manageable chunks""" | |
| text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( | |
| chunk_size=1024, | |
| chunk_overlap=256, | |
| ) | |
| split_docs = text_splitter.create_documents(content, metadatas=metadata) | |
| print(f"Split documents into {len(split_docs)} passages") | |
| return split_docs | |
| def ingest_into_vectordb(split_docs): | |
| """Create vector embeddings and store in FAISS""" | |
| embeddings = HuggingFaceEmbeddings( | |
| model_name="sentence-transformers/all-MiniLM-L6-v2", | |
| model_kwargs={'device':'cpu'} | |
| ) | |
| db = FAISS.from_documents(split_docs, embeddings) | |
| DB_FAISS_PATH = 'vectorstore/db_faiss' | |
| db.save_local(DB_FAISS_PATH) | |
| return db | |
| def get_conversation_chain(vectordb): | |
| """Create conversational retrieval chain""" | |
| llm = ChatGroq(model="llama3-70b-8192", temperature=0.25) | |
| retriever = vectordb.as_retriever() | |
| memory = ConversationBufferMemory( | |
| memory_key='chat_history', | |
| return_messages=True, | |
| output_key='answer' | |
| ) | |
| conversation_chain = ConversationalRetrievalChain.from_llm( | |
| llm=llm, | |
| retriever=retriever, | |
| memory=memory, | |
| return_source_documents=True | |
| ) | |
| print("Conversational Chain created for the LLM using the vector store") | |
| return conversation_chain | |
| def validate_answer_against_sources(response_answer, source_documents): | |
| """Validate AI's response against source documents""" | |
| model = SentenceTransformer('all-MiniLM-L6-v2') | |
| similarity_threshold = 0.5 | |
| source_texts = [doc.page_content for doc in source_documents] | |
| answer_embedding = model.encode(response_answer, convert_to_tensor=True) | |
| source_embeddings = model.encode(source_texts, convert_to_tensor=True) | |
| cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings) | |
| return any(score.item() > similarity_threshold for score in cosine_scores[0]) | |
| def handle_userinput(user_question): | |
| """Process user input and display chat history""" | |
| response = st.session_state.conversation({'question': user_question}) | |
| st.session_state.chat_history = response['chat_history'] | |
| for i, message in enumerate(st.session_state.chat_history): | |
| if i % 2 == 0: | |
| st.write(user_template.replace( | |
| "{{MSG}}", message.content), unsafe_allow_html=True) | |
| else: | |
| st.write(bot_template.replace( | |
| "{{MSG}}", message.content), unsafe_allow_html=True) | |
| def main(): | |
| """Main Streamlit application""" | |
| load_dotenv() | |
| st.set_page_config( | |
| page_title="PDF Insights AI", | |
| page_icon=":books:", | |
| layout="wide" | |
| ) | |
| st.write(css, unsafe_allow_html=True) | |
| # Welcome section | |
| st.title("π PDF Insights AI") | |
| st.markdown(""" | |
| ### Unlock the Knowledge in Your PDFs | |
| - π€ AI-powered document analysis | |
| - π¬ Ask questions about your uploaded documents | |
| - π Support for multiple PDF files | |
| """) | |
| # Initialize session state | |
| if "conversation" not in st.session_state: | |
| st.session_state.conversation = None | |
| if "chat_history" not in st.session_state: | |
| st.session_state.chat_history = [] | |
| # File upload section | |
| with st.sidebar: | |
| st.header("π€ Upload Documents") | |
| pdf_docs = st.file_uploader( | |
| "Upload your PDFs here", | |
| type=['pdf'], | |
| accept_multiple_files=True, | |
| help="Upload PDF files to analyze. Max file size: 200MB" | |
| ) | |
| # File validation | |
| if pdf_docs: | |
| for doc in pdf_docs: | |
| if doc.size > 200 * 1024 * 1024: # 200 MB | |
| st.error(f"File {doc.name} is too large. Maximum file size is 200MB.") | |
| pdf_docs.remove(doc) | |
| if st.button("Process Documents", type="primary"): | |
| if not pdf_docs: | |
| st.warning("Please upload at least one PDF file.") | |
| else: | |
| with st.spinner("Processing your documents..."): | |
| try: | |
| # Process documents | |
| content, metadata = prepare_docs(pdf_docs) | |
| split_docs = get_text_chunks(content, metadata) | |
| vectorstore = ingest_into_vectordb(split_docs) | |
| st.session_state.conversation = get_conversation_chain(vectorstore) | |
| st.success("Documents processed successfully! You can now ask questions.") | |
| except Exception as e: | |
| st.error(f"An error occurred while processing documents: {str(e)}") | |
| # Question input section | |
| user_question = st.text_input( | |
| "π Ask a question about your documents", | |
| placeholder="What insights can you provide from these documents?" | |
| ) | |
| if user_question: | |
| if st.session_state.conversation is None: | |
| st.warning("Please upload and process documents first.") | |
| else: | |
| handle_userinput(user_question) | |
| if __name__ == '__main__': | |
| main() |