Spaces:

dnzblgn
/

RAG_PDF_langchain

Sleeping

App Files Files Community

dnzblgn committed on Feb 2, 2025

Commit

6faf44c

verified ·

1 Parent(s): c878a32

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -59

app.py CHANGED Viewed

@@ -1,56 +1,39 @@
 import gradio as gr
 import os
 import time
-import PyPDF2
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain.chains import ConversationalRetrievalChain
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.memory import ConversationBufferMemory
 from langchain_community.llms import HuggingFaceEndpoint
-def read_file(file_path):
-    try:
-        if file_path.endswith(".txt"):
-            with open(file_path, "r", encoding="utf-8") as f:
-                content = f.read()
-        elif file_path.endswith(".pdf"):
-            content = ""
-            with open(file_path, "rb") as f:
-                reader = PyPDF2.PdfReader(f)
-                for page in reader.pages:
-                    content += page.extract_text() + "\n"
-        else:
-            return None, "Unsupported file format. Please upload a .txt or .pdf file."
-        if not content.strip():
-            return None, "File is empty. Please upload a valid document."
-        return content, "Successfully processed the uploaded file! Ready for questions."
-    except Exception as e:
-        return None, f"Error reading file: {str(e)}"
-def create_db_from_text(text):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
-    splits = text_splitter.create_documents([text])
-    # Specify an explicit model for embeddings
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    vector_db = FAISS.from_documents(splits, embeddings)
-    return vector_db
 def initialize_chatbot(vector_db):
     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
     retriever = vector_db.as_retriever()
     llm = HuggingFaceEndpoint(
         repo_id="mistralai/Mistral-7B-Instruct-v0.2",
         huggingfacehub_api_token=os.environ.get("HUGGINGFACE_API_TOKEN"),
         temperature=0.5,
         max_new_tokens=256
     )
     qa_chain = ConversationalRetrievalChain.from_llm(
         llm=llm,
         retriever=retriever,
@@ -59,19 +42,16 @@ def initialize_chatbot(vector_db):
     )
     return qa_chain
-def process_and_initialize(file):
-    if file is None:
         return None, None, "Please upload a file first."
     try:
-        text, status_message = read_file(file)
-        if text is None:
-            return None, None, status_message
-        db = create_db_from_text(text)
         qa = initialize_chatbot(db)
-        return db, qa, status_message
     except Exception as e:
         return None, None, f"Processing error: {str(e)}"
@@ -80,10 +60,8 @@ def user_query_typing_effect(query, qa_chain, chatbot):
     try:
         response = qa_chain.invoke({"question": query, "chat_history": []})
         assistant_response = response["answer"]
         history.append({"role": "user", "content": query})
         history.append({"role": "assistant", "content": ""})
         for i in range(len(assistant_response)):
             history[-1]["content"] += assistant_response[i]
             yield history, ""
@@ -112,28 +90,24 @@ def demo():
         background-color: #FFF5E1;
     }
     """
     with gr.Blocks(css=custom_css) as app:
-        vector_db = gr.State(None)
-        qa_chain = gr.State(None)
-        gr.Markdown("### 🌟 **Document-Based Chatbot** 🌟")
         gr.Markdown("#### Upload your document and ask questions interactively!")
         with gr.Row():
             with gr.Column(scale=1):
-                txt_file = gr.File(
-                    label="📁 Upload Document",
                     file_types=[".txt", ".pdf"],
-                    type="filepath"
                 )
-                analyze_btn = gr.Button("🚀 Process Document")
                 status = gr.Textbox(
                     label="📊 Status",
                     placeholder="Status updates will appear here...",
                     interactive=False
                 )
             with gr.Column(scale=3):
                 chatbot = gr.Chatbot(
                     label="🤖 Chat with your data",
@@ -151,29 +125,25 @@ def demo():
                     container=False
                 )
                 query_btn = gr.Button("Ask")
         analyze_btn.click(
             fn=process_and_initialize,
             inputs=[txt_file],
             outputs=[vector_db, qa_chain, status],
             show_progress="minimal"
         )
         query_btn.click(
             fn=user_query_typing_effect,
             inputs=[query_input, qa_chain, chatbot],
             outputs=[chatbot, query_input],
             show_progress="minimal"
         )
         query_input.submit(
             fn=user_query_typing_effect,
             inputs=[query_input, qa_chain, chatbot],
             outputs=[chatbot, query_input],
             show_progress="minimal"
         )
     app.launch()
 if __name__ == "__main__":
-    demo()

 import gradio as gr
 import os
 import time
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain.chains import ConversationalRetrievalChain
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.memory import ConversationBufferMemory
 from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.document_loaders import PyPDFLoader
def load_doc(list_file_path):
    """Load the given documents and split them into overlapping chunks.

    Args:
        list_file_path: Iterable of filesystem paths. Paths ending in
            ``.txt`` (case-insensitive) are read as UTF-8 plain text; every
            other path is handed to ``PyPDFLoader``.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_documents``
        (chunk_size=1024, chunk_overlap=64) over all loaded pages.
    """
    # Function-scope import: the module header only imports PyPDFLoader.
    from langchain_community.document_loaders import TextLoader

    pages = []
    for path in list_file_path:
        # The upload widget in this file accepts ".txt" as well as ".pdf";
        # plain-text files must not be routed through the PDF parser.
        if str(path).lower().endswith(".txt"):
            loader = TextLoader(path, encoding="utf-8")
        else:
            loader = PyPDFLoader(path)
        pages.extend(loader.load())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=64
    )
    return text_splitter.split_documents(pages)
def create_db(splits):
    """Embed the document chunks and index them in a FAISS vector store.

    Args:
        splits: Chunked documents, passed unchanged to
            ``FAISS.from_documents``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(splits, embedder)
 def initialize_chatbot(vector_db):
     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
     retriever = vector_db.as_retriever()
     llm = HuggingFaceEndpoint(
         repo_id="mistralai/Mistral-7B-Instruct-v0.2",
         huggingfacehub_api_token=os.environ.get("HUGGINGFACE_API_TOKEN"),
         temperature=0.5,
         max_new_tokens=256
     )
     qa_chain = ConversationalRetrievalChain.from_llm(
         llm=llm,
         retriever=retriever,
     )
     return qa_chain
def process_and_initialize(files):
    """Build the vector store and QA chain from the uploaded files.

    Args:
        files: Uploaded files. Each entry may be a plain path string or an
            object exposing its path as ``.name``; ``None`` entries are
            ignored.

    Returns:
        A ``(vector_db, qa_chain, status_message)`` tuple. The first two
        items are ``None`` when nothing usable was uploaded or when
        processing raised an exception.
    """
    if not files:
        return None, None, "Please upload a file first."
    try:
        # Accept both path strings and file-like wrappers carrying ``.name``
        # so the function works regardless of how the upload component
        # delivers its value.
        list_file_path = [
            file if isinstance(file, str) else file.name
            for file in files
            if file is not None
        ]
        # Every entry was None: report it as "nothing uploaded" instead of
        # letting an empty document list fail further down the pipeline.
        if not list_file_path:
            return None, None, "Please upload a file first."
        doc_splits = load_doc(list_file_path)
        db = create_db(doc_splits)
        qa = initialize_chatbot(db)
        return db, qa, "Database created! Ready for questions."
    except Exception as e:
        # UI boundary: surface the failure in the status box, do not crash.
        return None, None, f"Processing error: {str(e)}"
     try:
         response = qa_chain.invoke({"question": query, "chat_history": []})
         assistant_response = response["answer"]
         history.append({"role": "user", "content": query})
         history.append({"role": "assistant", "content": ""})
         for i in range(len(assistant_response)):
             history[-1]["content"] += assistant_response[i]
             yield history, ""
         background-color: #FFF5E1;
     }
     """
     with gr.Blocks(css=custom_css) as app:
+        vector_db = gr.State()
+        qa_chain = gr.State()
+        gr.Markdown("### 🌟 **PDF & TXT Chatbot** 🌟")
         gr.Markdown("#### Upload your document and ask questions interactively!")
         with gr.Row():
             with gr.Column(scale=1):
+                txt_file = gr.Files(
+                    label="📁 Upload Documents",
                     file_types=[".txt", ".pdf"],
+                    type="file"
                 )
+                analyze_btn = gr.Button("🚀 Process Documents")
                 status = gr.Textbox(
                     label="📊 Status",
                     placeholder="Status updates will appear here...",
                     interactive=False
                 )
             with gr.Column(scale=3):
                 chatbot = gr.Chatbot(
                     label="🤖 Chat with your data",
                     container=False
                 )
                 query_btn = gr.Button("Ask")
         analyze_btn.click(
             fn=process_and_initialize,
             inputs=[txt_file],
             outputs=[vector_db, qa_chain, status],
             show_progress="minimal"
         )
         query_btn.click(
             fn=user_query_typing_effect,
             inputs=[query_input, qa_chain, chatbot],
             outputs=[chatbot, query_input],
             show_progress="minimal"
         )
         query_input.submit(
             fn=user_query_typing_effect,
             inputs=[query_input, qa_chain, chatbot],
             outputs=[chatbot, query_input],
             show_progress="minimal"
         )
     app.launch()
 if __name__ == "__main__":
+    demo()