Spaces:

dnzblgn
/

RAG_PDF_langchain

Running

App Files Files Community

dnzblgn committed on Feb 2, 2025

Commit

e4d5b9b

verified ·

1 Parent(s): 18541f2

Update app.py

Browse files

Files changed (1) hide show

app.py +158 -73

app.py CHANGED Viewed

@@ -1,91 +1,176 @@
-import os
 import gradio as gr
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.llms import HuggingFaceEndpoint
-from langchain_community.vectorstores import FAISS
-from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
-# Initialize embeddings
-embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-# Initialize Mistral LLM
-llm = HuggingFaceEndpoint(
-    endpoint_url="https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2",
-    huggingfacehub_api_token=os.getenv("HF_TOKEN"),
-    task="text-generation",
-)
-def process_pdf(pdf_file):
-    # Load PDF
-    loader = PyPDFLoader(pdf_file)
-    documents = loader.load()
-    # Split text into chunks
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
     )
-    chunks = text_splitter.split_documents(documents)
-    # Create vector store
-    vectorstore = FAISS.from_documents(chunks, embeddings)
-    return vectorstore
-def setup_rag_chain(vectorstore):
-    memory = ConversationBufferMemory(
-        memory_key="chat_history",
-        return_messages=True,
-        output_key='answer'
-    )
-    chain = ConversationalRetrievalChain.from_llm(
         llm=llm,
-        retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
         memory=memory,
-        return_source_documents=True,
-        chain_type="stuff",
-        verbose=True
     )
-    return chain
-def get_response(query, chain):
-    result = chain({"question": query})
-    return result['answer']
-def create_demo():
-    def process_file(file):
-        vectorstore = process_pdf(file.name)
-        return setup_rag_chain(vectorstore)
-    def respond(message, history, chain_state):
-    if chain_state is None:
-        return history + [["Please upload a PDF first.", None]]
-    response = get_response(message, chain_state)
-    history = history + [[message, response]]
-    return history
-    with gr.Blocks() as demo:
-        chain_state = gr.State(None)
-        with gr.Row():
-            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-        chatbot = gr.Chatbot()
-        msg = gr.Textbox(label="Question")
-        clear = gr.Button("Clear")
-        file_input.upload(fn=process_file, outputs=[chain_state])
-        msg.submit(fn=respond, inputs=[msg, chatbot, chain_state], outputs=[chatbot])
-        clear.click(lambda: None, None, chatbot, queue=False)
-    return demo
 if __name__ == "__main__":
-    demo = create_demo()
-    demo.launch()

 import gradio as gr
+import os
+import time
+import PyPDF2
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
 from langchain.chains import ConversationalRetrievalChain
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.memory import ConversationBufferMemory
+from langchain_community.llms import HuggingFaceEndpoint
def read_file(file_path):
    """Read the text content of an uploaded ``.txt`` or ``.pdf`` file.

    The extension check is case-insensitive, so ``REPORT.PDF`` is handled the
    same way as ``report.pdf``.

    Args:
        file_path: Path of the uploaded file on disk.

    Returns:
        A ``(content, message)`` tuple. ``content`` is the extracted text, or
        ``None`` when the file is unsupported, empty, or unreadable;
        ``message`` is the status text to show to the user.
    """
    try:
        # Compare on a lower-cased copy only; the file is opened with the
        # original path so case-sensitive file systems still resolve it.
        suffix_probe = file_path.lower()
        if suffix_probe.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        elif suffix_probe.endswith(".pdf"):
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # A page without extractable text contributes an empty line
                # instead of aborting the whole document.
                content = "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        else:
            return None, "Unsupported file format. Please upload a .txt or .pdf file."
        if not content.strip():
            return None, "File is empty. Please upload a valid document."
        return content, "Successfully processed the uploaded file! Ready for questions."
    except Exception as e:
        # UI boundary: report any read/parse failure as a status message
        # rather than letting it propagate into the Gradio event handler.
        return None, f"Error reading file: {str(e)}"
def create_db_from_text(text):
    """Build a FAISS vector store from the raw text of a document.

    The text is split into chunks of up to 1024 characters with a 64-character
    overlap, and the chunks are embedded with a default-constructed
    ``HuggingFaceEmbeddings`` instance.

    Args:
        text: Full document text to index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    documents = chunker.create_documents([text])
    return FAISS.from_documents(documents, HuggingFaceEmbeddings())
def initialize_chatbot(vector_db):
    """Create a conversational retrieval chain over ``vector_db``.

    The chain combines a conversation buffer memory (stored under the
    ``chat_history`` key), the vector store's default retriever, and a
    ``mistralai/Mistral-7B-Instruct-v0.2`` endpoint authenticated with the
    ``HUGGINGFACE_API_TOKEN`` environment variable.

    Args:
        vector_db: Vector store exposing ``as_retriever()``.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    chat_memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    doc_retriever = vector_db.as_retriever()
    api_token = os.environ.get("HUGGINGFACE_API_TOKEN")
    mistral = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        huggingfacehub_api_token=api_token,
        temperature=0.5,
        max_new_tokens=256,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=mistral,
        retriever=doc_retriever,
        memory=chat_memory,
        verbose=False,
    )
def process_and_initialize(file):
    """Read the uploaded file, index it, and build the QA chain.

    Args:
        file: Path of the uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        A ``(vector_db, qa_chain, status_message)`` tuple. The first two
        entries are ``None`` whenever the file is missing, cannot be read, or
        processing raises an exception.
    """
    if file is None:
        return None, None, "Please upload a file first."
    try:
        text, status_message = read_file(file)
        if text is not None:
            store = create_db_from_text(text)
            return store, initialize_chatbot(store), status_message
        # read_file already produced the user-facing explanation.
        return None, None, status_message
    except Exception as e:
        return None, None, f"Processing error: {str(e)}"
def user_query_typing_effect(query, qa_chain, chatbot, typing_delay=0.05):
    """Answer ``query`` with the QA chain and stream the reply character by character.

    This is a generator intended for use as a Gradio event handler: every
    yielded value is a ``(history, textbox_value)`` pair, where ``history`` is
    a list of ``{"role": ..., "content": ...}`` message dicts.

    Args:
        query: The user's question.
        qa_chain: Object exposing ``invoke({"question": ..., "chat_history": ...})``
            and returning a mapping with an ``"answer"`` key, or ``None`` when
            no document has been processed yet.
        chatbot: Existing chat history (list of message dicts) or ``None``.
        typing_delay: Seconds to pause after each streamed character; ``0``
            disables the pause.

    Yields:
        ``(history, textbox_value)`` tuples. The same ``history`` list object
        is updated in place between yields.
    """
    # Work on a copy so the caller's list is never mutated.
    history = list(chatbot or [])

    if qa_chain is None:
        # No document processed yet: explain instead of surfacing an
        # AttributeError, and keep the typed question in the textbox.
        history.append(
            {"role": "assistant", "content": "Please upload and process a document first."}
        )
        yield history, query
        return

    if not query or not query.strip():
        # Nothing to ask; do not spend an LLM call on a blank question.
        yield history, ""
        return

    # Record the question first so it stays visible even if the chain fails.
    history.append({"role": "user", "content": query})
    try:
        response = qa_chain.invoke({"question": query, "chat_history": []})
        assistant_response = response["answer"] or ""
    except Exception as e:
        history.append({"role": "assistant", "content": f"Error: {str(e)}"})
        yield history, ""
        return

    history.append({"role": "assistant", "content": ""})
    # Always emit once so the question shows up and the textbox is cleared,
    # even when the answer is empty.
    yield history, ""
    for char in assistant_response:
        history[-1]["content"] += char
        yield history, ""
        if typing_delay > 0:
            time.sleep(typing_delay)
def demo():
    """Build the Gradio document-chat interface and launch it.

    The layout has a narrow column for uploading and processing a document
    and a wide column holding the chat window, question box and "Ask" button.
    The vector store and QA chain are kept in per-session ``gr.State`` values.
    """
    custom_css = """
    body {
        background-color: #FF8C00;
        font-family: Arial, sans-serif;
    }
    .gradio-container {
        border-radius: 15px;
        box-shadow: 0px 4px 20px rgba(0, 0, 0, 0.3);
        padding: 20px;
    }
    footer {
        visibility: hidden;
    }
    .chatbot {
        border: 2px solid #000;
        border-radius: 10px;
        background-color: #FFF5E1;
    }
    """
    with gr.Blocks(css=custom_css) as ui:
        # Session state filled in by the "Process Document" handler.
        db_state = gr.State(None)
        chain_state = gr.State(None)

        gr.Markdown("### 🌟 **Document-Based Chatbot** 🌟")
        gr.Markdown("#### Upload your document and ask questions interactively!")

        with gr.Row():
            # Left column: upload, process button and status read-out.
            with gr.Column(scale=1):
                upload = gr.File(
                    label="📁 Upload Document",
                    file_types=[".txt", ".pdf"],
                    type="filepath",
                )
                process_btn = gr.Button("🚀 Process Document")
                status_box = gr.Textbox(
                    label="📊 Status",
                    placeholder="Status updates will appear here...",
                    interactive=False,
                )
            # Right column: the conversation itself.
            with gr.Column(scale=3):
                chat_window = gr.Chatbot(
                    label="🤖 Chat with your data",
                    height=600,
                    bubble_full_width=False,
                    show_label=False,
                    render_markdown=True,
                    type="messages",
                    elem_classes=["chatbot"],
                )
                question_box = gr.Textbox(
                    label="Ask a question",
                    placeholder="Ask about the document...",
                    show_label=False,
                    container=False,
                )
                ask_btn = gr.Button("Ask")

        process_btn.click(
            fn=process_and_initialize,
            inputs=[upload],
            outputs=[db_state, chain_state, status_box],
            show_progress="minimal",
        )

        # Clicking "Ask" and pressing Enter in the question box run the same
        # handler with the same inputs and outputs.
        for register in (ask_btn.click, question_box.submit):
            register(
                fn=user_query_typing_effect,
                inputs=[question_box, chain_state, chat_window],
                outputs=[chat_window, question_box],
                show_progress="minimal",
            )

    ui.launch()


if __name__ == "__main__":
    demo()