PDF_QA_Assistant

Sleeping

App Files Files Community

Aseem Gupta committed on May 3, 2025

Commit

35d9362

1 Parent(s): bfa0055

Current alpha version: PDFs only; for now, all users share a common DB.

Browse files

Files changed (2) hide show

app.py +137 -0
requirements.txt +18 -0

app.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import gradio as gr
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+# from langchain_chroma import Chroma
+from langchain_community.vectorstores import FAISS
+from langchain_groq import ChatGroq
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate
+import os
+from dotenv import load_dotenv
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+# from langchain.embeddings import HuggingFaceEmbeddings # open source free embedding
+load_dotenv()
+class PDFQAProcessor:
+    SYSTEM_PROMPT = os.getenv('SYSTEM_PROMPT')
+    llm = ChatGroq(
+        # model_name="deepseek-r1-distill-llama-70b",
+        model_name="llama-3.3-70b-versatile",
+        temperature=0.1,
+        max_tokens=8000,
+        api_key = os.getenv('GROQ_API_KEY')
+    )
+    # Setup RAG chain
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", SYSTEM_PROMPT),
+        ("human", "{input}"),
+    ])
+    question_answer_chain = create_stuff_documents_chain(llm, prompt)
+    # EMBEDDING_MODEL = "intfloat/e5-large-v2"
+    # embeddings = HuggingFaceEmbeddings(
+    #     model_name=EMBEDDING_MODEL,
+    #     model_kwargs={'device': 'cpu'},
+    #     encode_kwargs={'normalize_embeddings': True}
+    # )
+    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    CHUNK_SIZE = 550
+    CHUNK_OVERLAP = 80
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,chunk_overlap = CHUNK_OVERLAP)
+    # persist_directory="./chroma_db"
+    def __init__(self):
+        self.vectorstore = None
+        self.retriever = None
+    def process_pdfs(self, pdf_files):
+        """Processing PDF files and creating vector store"""
+        if not pdf_files:
+            return "Please upload PDF files first!"
+        try:
+            # Load and split documents
+            docs = []
+            for pdf_file in pdf_files:
+                loader = PyPDFLoader(pdf_file.name)
+                docs.extend(loader.load())
+            splits = self.text_splitter.split_documents(docs)
+            # # Create vector store
+            # self.vectorstore = Chroma.from_documents(
+            #     documents=splits,
+            #     embedding=self.embeddings,
+            #     # persist_directory = self.persist_directory
+            # )
+            # Replace Chroma with:
+            self.vectorstore = FAISS.from_documents(
+                splits,
+                self.embeddings
+            )
+            self.retriever = self.vectorstore.as_retriever(search_kwargs={"k": 18})
+            return "PDFs processed successfully! Ask your questions now."
+        except Exception as e:
+            return f"Error processing PDFs: {str(e)}"
+    def answer_question(self, question):
+        """Handling question answering"""
+        if not self.retriever:
+            return "Please process PDFs first!", None
+        try:
+            # Initialize LLM
+            rag_chain = create_retrieval_chain(self.retriever, self.question_answer_chain)
+            response = rag_chain.invoke({"input": question})
+            final_response = response["answer"] + "\n\n### Sources\n\n"  # Changed to use markdown formatting
+            for info in response["context"]:
+                final_response += (
+                    f"{info.page_content}<br>"  # Changed to use markdown bold formatting
+                    f"Source of Info: {info.metadata['source']}<br>"
+                    f"At Page No: {info.metadata['page_label']}<br><br>"
+                    )
+            return final_response
+        except Exception as e:
+            return f"Error answering question: {str(e)}", None
+processor = PDFQAProcessor()
+with gr.Blocks(title="PDF QA Assistant") as demo:
+    with gr.Tab("Upload PDFs"):
+        file_input = gr.Files(label="Upload PDFs", file_types=[".pdf"])
+        process_btn = gr.Button("Process PDFs")
+        status_output = gr.Textbox(label="Processing Status")
+    with gr.Tab("Ask Questions"):
+        question_input = gr.Textbox(label="Your Question")
+        # answer_output = gr.Textbox(label="Answer", interactive=False)
+        answer_output = gr.Markdown(label="Answer")
+        ask_btn = gr.Button("Ask Question")
+    process_btn.click(
+        processor.process_pdfs,
+        inputs=file_input,
+        outputs=status_output
+    )
+    # QA workflow
+    ask_btn.click(
+        processor.answer_question,
+        inputs=question_input,
+        outputs=[answer_output]
+    )
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+gradio==5.14.0
+groq==0.15.0
+huggingface-hub==0.27.1
+langchain==0.3.15
+langchain-community==0.3.15
+langchain-core==0.3.31
+langchain-experimental==0.3.4
+langchain-google-genai==2.0.9
+langchain-groq==0.2.3
+langchain-text-splitters==0.3.5
+nltk==3.9.1
+python-dotenv==1.0.1
+sentence-transformers==3.4.0
+tokenizers==0.20.3
+torch==2.5.1
+transformers==4.46.3
+unstructured==0.16.15
+faiss-cpu