Spaces:

19arjun89
/

RAG

Sleeping

App Files Files Community

Arjun Singh committed on Feb 16, 2025

Commit

d8f0836

1 Parent(s): 7746d4c

Build PDF RAG

Browse files

Files changed (2) hide show

app.py +122 -0
requirements.txt +17 -0

app.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import os
+from langchain_text_splitters import CharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.schema import HumanMessage
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain_chroma import Chroma
+from langchain_groq import ChatGroq
+import gradio as gr
+# Initialize ChromaDB settings, the embedding model, and the Groq chat client (all created once at import time)
+DB_DIR = "chroma_db"  # passed to Chroma as persist_directory
+COLLECTION_NAME = "document_collection"  # the one collection name used for every upload and query
+embedding_function = HuggingFaceEmbeddings()  # no model name given, so the library default is used
+GROQ_API_KEY = groq_api_key = os.environ.get("GROQ_API_KEY")  # both names receive the same value; None if the variable is unset
+llm = ChatGroq(api_key=GROQ_API_KEY, model_name="llama-3.1-8b-instant")
+# Keep track of current document ID: the 'document_id' metadata value queries filter on (None until upload_and_process sets it)
+current_document_id = None
+def load_and_split_document(file_path):
+    """Loads a document and splits it into chunks."""
+    loader = UnstructuredFileLoader(file_path)
+    documents = loader.load()
+    text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=50)
+    chunks = text_splitter.split_documents(documents)
+    return chunks
+def upload_and_process(file):
+    """Processes uploaded file and stores it in ChromaDB."""
+    try:
+        global current_document_id
+        uploaded_file_path = file.name
+        # Generate a unique document ID (using filename in this case)
+        current_document_id = os.path.basename(uploaded_file_path)
+        # Load and split the document
+        chunks = load_and_split_document(uploaded_file_path)
+        # Add document ID as metadata to each chunk
+        for chunk in chunks:
+            chunk.metadata['document_id'] = current_document_id
+        # Get or create vector store
+        vector_store = Chroma(
+            persist_directory=DB_DIR,
+            embedding_function=embedding_function,
+            collection_name=COLLECTION_NAME
+        )
+        # Add new documents
+        vector_store.add_documents(chunks)
+        return f"Document successfully processed: {current_document_id}"
+    except Exception as e:
+        return f"Error processing document: {str(e)}"
+def retrieve_and_generate_response(query):
+    """Retrieves relevant text and uses Groq LLM to generate a response."""
+    try:
+        vector_store = Chroma(
+            persist_directory=DB_DIR,
+            embedding_function=embedding_function,
+            collection_name=COLLECTION_NAME
+        )
+        # Only search within the current document
+        if current_document_id:
+            filter_dict = {"document_id": current_document_id}
+            results = vector_store.similarity_search(
+                query,
+                k=2,
+                filter=filter_dict
+            )
+        else:
+            return "Please upload a document first."
+        retrieved_texts = [doc.page_content for doc in results]
+        context = "\n".join(retrieved_texts)
+        if not context:
+            return "No relevant content found in the current document."
+        messages = [
+            HumanMessage(content=f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}")
+        ]
+        response = llm.invoke(messages)
+        return response.content
+    except Exception as e:
+        return f"Error generating response: {str(e)}"
+# Define the Gradio UI: upload/indexing controls first, then question/answer controls
+with gr.Blocks() as demo:
+    gr.Markdown("# 🤖 RAG Chatbot with Groq & ChromaDB")
+    file_input = gr.File(label="Upload a PDF")
+    upload_button = gr.Button("Process Document")
+    upload_status = gr.Textbox(label="Upload Status", interactive=False)  # displays the string returned by upload_and_process
+    query_input = gr.Textbox(label="Ask a Question")
+    response_output = gr.Textbox(label="Response", interactive=False)  # displays the string returned by retrieve_and_generate_response
+    chat_button = gr.Button("Get Answer")
+    upload_button.click(  # index the selected file and show the resulting status
+        upload_and_process,
+        inputs=[file_input],
+        outputs=[upload_status]
+    )
+    chat_button.click(  # answer the typed question from the active document
+        retrieve_and_generate_response,  # Use the function directly
+        inputs=[query_input],
+        outputs=[response_output]
+    )
+# Launch the Gradio app (executed at module level; there is no __main__ guard)
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+langchain
+langchain-community
+langchain-groq
+langchain-chroma
+chromadb
+sentence-transformers
+gradio
+unstructured
+pdf2image
+python-magic
+pdfminer.six
+nltk
+transformers
+torch
+numpy
+Pillow
+pypdf
+python-docx