Spaces:

manoj1hcl
/

ChatWIthPdf

Sleeping

App Files Files Community

manoj1hcl committed on Feb 1

Commit

aca5cc5

verified ·

1 Parent(s): bbf1ecd

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -74

app.py CHANGED Viewed

@@ -1,117 +1,122 @@
 import os
-import pdfplumber
 from dotenv import load_dotenv
 import gradio as gr
 from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_chroma import Chroma
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.chains import RetrievalQA
-from langchain.llms import OpenAI
-from langchain.document_loaders import TextLoader
-from langchain.docstore.document import Document
-from transformers import AutoTokenizer
-from langchain.document_loaders import PyPDFLoader
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
-# price is a factor for our company, so we're going to use a low cost model
 MODEL = "gpt-4o-mini"
-db_name = "vector_db"
-# Load environment variables in a file called .env
 load_dotenv(override=True)
 def process_pdf(pdf_file):
     try:
-        loader = PyPDFLoader(pdf_file.name)
         pages = loader.load()
         if not pages:
-            raise ValueError("No text found in padf.")
-        text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=500,
-            chunk_overlap=50
         )
-        chunks = text_splitter.split_documents(pages)
-        if not chunks:
-            raise ValueError("Unable to split the PDF into chunks.")
         if not chunks:
-            raise ValueError(f"ERROR: File is ecrypted/protected No text chunks generated fro {pdf_file}.")
         embeddings = OpenAIEmbeddings()
-        #print(chunks)
-        if os.path.exists(db_name):
-            Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()
-        # Embed the chunks with OpenAI Embeddings
-        vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
-        # Sample embedding dimension
-        collection = vectorstore._collection
-        sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
-        dimensions = len(sample_embedding)
-        print(f"The vectors have {dimensions:,} dimensions")
-        # Create the OpenAI Chat Model
-        llm = ChatOpenAI(temperature=0.7, model=MODEL)  # Or another model
-        # Set up conversation memory
-        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
-        # Set up the retriever (vector store)
-        retriever = vectorstore.as_retriever()
-        # Set up the Conversational Retrieval Chain
-        conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
-        # Return the conversation chain
         return conversation_chain
     except Exception as e:
         raise RuntimeError(f"PDF processing failed: {str(e)}")
-# Function to upload PDF
 def upload_pdf(file):
     global chain
     if file is None:
         chain = None
-        return "pleae upload the file!"
     chain = process_pdf(file)
-    return "processed the file ask questions"
-# ask_question function
 def ask_question(message, history):
     if chain is None:
-        return "upload the pdf first"
-    else:
-        try:
-            result = chain.invoke({"question":message})
-            answer = result.get("answer", "No answer found.")
-        except Exception as e:
-            answer = f"Error:{str(e)}"
-    history.append((message, answer))
     return history, history, ""
-# Building Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("## Chat with your pdf!!")
-    # File uploader
     file_input = gr.File(label="Upload your PDF", file_types=[".pdf"])
-    # Status text
     status = gr.Textbox(label="Status", interactive=False)
-    chatbot = gr.Chatbot(label="Chat history!!!")
-    msg=gr.Textbox(label="Ask anything related to pdf...")
     clear = gr.Button("Clear chat")
-    state = gr.State([])
     file_input.change(upload_pdf, inputs=[file_input], outputs=[status])
-    msg.submit(ask_question, [msg, state], [chatbot, state, msg])
-    clear.click(lambda: ([],[]), None, [chatbot, state])
-    chain = None  # global QA chain
-# Launch the app
-demo.launch(inline=False)

 import os
+import shutil
 from dotenv import load_dotenv
 import gradio as gr
 from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_chroma import Chroma
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
+# Low-cost model
 MODEL = "gpt-4o-mini"
+DB_DIR = "vector_db"
 load_dotenv(override=True)
+chain = None  # global chain (OK for single-user; see note below)
 def process_pdf(pdf_file):
     try:
+        if not os.getenv("OPENAI_API_KEY"):
+            raise RuntimeError(
+                "OPENAI_API_KEY is not set. Add it to your environment or as a Secret on HF Spaces."
+            )
+        file_path = pdf_file.name  # gr.File gives a temp file with .name path
+        loader = PyPDFLoader(file_path)
         pages = loader.load()
         if not pages:
+            raise ValueError("No text found in PDF (may be scanned or protected).")
+        splitter = RecursiveCharacterTextSplitter(
             chunk_size=500,
+            chunk_overlap=50,
         )
+        chunks = splitter.split_documents(pages)
         if not chunks:
+            raise ValueError("Unable to split PDF into chunks (empty/protected PDF).")
+        # Embeddings (you can also specify: model="text-embedding-3-small")
         embeddings = OpenAIEmbeddings()
+        # Reset persisted DB each upload
+        if os.path.exists(DB_DIR):
+            shutil.rmtree(DB_DIR, ignore_errors=True)
+        vectorstore = Chroma.from_documents(
+            documents=chunks,
+            embedding=embeddings,
+            persist_directory=DB_DIR,
+        )
+        llm = ChatOpenAI(model=MODEL, temperature=0.2)
+        memory = ConversationBufferMemory(
+            memory_key="chat_history",
+            return_messages=True,
+        )
+        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
+        conversation_chain = ConversationalRetrievalChain.from_llm(
+            llm=llm,
+            retriever=retriever,
+            memory=memory,
+        )
         return conversation_chain
     except Exception as e:
         raise RuntimeError(f"PDF processing failed: {str(e)}")
 def upload_pdf(file):
     global chain
     if file is None:
         chain = None
+        return "Please upload a PDF."
     chain = process_pdf(file)
+    return "PDF processed. Ask questions now."
 def ask_question(message, history):
     if chain is None:
+        history = history or []
+        history.append({"role": "assistant", "content": "Upload the PDF first."})
+        return history, history, ""
+    try:
+        result = chain.invoke({"question": message})
+        answer = result.get("answer", "No answer found.")
+    except Exception as e:
+        answer = f"Error: {str(e)}"
+    history = history or []
+    history.append({"role": "user", "content": message})
+    history.append({"role": "assistant", "content": answer})
     return history, history, ""
 with gr.Blocks() as demo:
+    gr.Markdown("## Chat with your PDF")
     file_input = gr.File(label="Upload your PDF", file_types=[".pdf"])
     status = gr.Textbox(label="Status", interactive=False)
+    chatbot = gr.Chatbot(label="Chat history", type="messages")
+    msg = gr.Textbox(label="Ask anything related to the PDF...")
     clear = gr.Button("Clear chat")
+    state = gr.State([])
     file_input.change(upload_pdf, inputs=[file_input], outputs=[status])
+    msg.submit(ask_question, inputs=[msg, state], outputs=[chatbot, state, msg])
+    clear.click(lambda: ([], []), inputs=None, outputs=[chatbot, state])
+demo.launch(inline=False)