Arjun Singh committed on
Commit
d8f0836
·
1 Parent(s): 7746d4c

Build PDF RAG

Browse files
Files changed (2) hide show
  1. app.py +122 -0
  2. requirements.txt +17 -0
app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_text_splitters import CharacterTextSplitter
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain.schema import HumanMessage
5
+ from langchain.document_loaders import UnstructuredFileLoader
6
+ from langchain_chroma import Chroma
7
+ from langchain_groq import ChatGroq
8
+ import gradio as gr
9
+
10
# --- ChromaDB / Groq configuration -----------------------------------------
DB_DIR = "chroma_db"                     # on-disk directory for the persisted Chroma index
COLLECTION_NAME = "document_collection"  # single shared collection for all uploads

# Embedding model used both at indexing time and at query time; the two must
# match for similarity search to be meaningful.
embedding_function = HuggingFaceEmbeddings()

# Fix: the original used a redundant chained assignment
# (GROQ_API_KEY = groq_api_key = os.environ.get(...)) that bound the same
# value to two names; only one is ever used.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")  # NOTE(review): None if unset — ChatGroq calls will fail at request time
llm = ChatGroq(api_key=GROQ_API_KEY, model_name="llama-3.1-8b-instant")

# Module-level id of the most recently uploaded document; queries are
# filtered to chunks tagged with this id.
current_document_id = None
20
+
21
def load_and_split_document(file_path):
    """Load the file at *file_path* and split it into overlapping chunks.

    Uses UnstructuredFileLoader to parse the document, then a
    CharacterTextSplitter (400-char chunks, 50-char overlap) to break it up.
    Returns the list of chunk Documents.
    """
    splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=50)
    raw_documents = UnstructuredFileLoader(file_path).load()
    return splitter.split_documents(raw_documents)
30
+
31
def upload_and_process(file):
    """Index an uploaded file into ChromaDB, tagging each chunk with a doc id.

    The document id is the uploaded file's basename; it is stored in the
    module-level ``current_document_id`` so later queries search only this
    document. Returns a human-readable status string for the Gradio UI
    (errors are reported as text rather than raised).
    """
    global current_document_id
    try:
        uploaded_path = file.name
        doc_id = os.path.basename(uploaded_path)  # filename doubles as the document id
        current_document_id = doc_id

        # Parse + chunk, then tag every chunk so queries can filter by document.
        pieces = load_and_split_document(uploaded_path)
        for piece in pieces:
            piece.metadata['document_id'] = doc_id

        # Open (or create) the persistent vector store and append the chunks.
        store = Chroma(
            persist_directory=DB_DIR,
            embedding_function=embedding_function,
            collection_name=COLLECTION_NAME,
        )
        store.add_documents(pieces)

        return f"Document successfully processed: {doc_id}"
    except Exception as exc:
        return f"Error processing document: {str(exc)}"
60
+
61
def retrieve_and_generate_response(query):
    """Answer *query* from chunks retrieved out of the current document.

    Retrieves the top-2 most similar chunks, restricted (via a metadata
    filter) to the most recently uploaded document, then asks the Groq LLM
    to answer using that context. Returns the answer text, or a user-facing
    status/error message (errors are returned as text for the Gradio UI).
    """
    try:
        # Fix: guard clause first — the original built the Chroma store
        # unconditionally, doing wasted work when no document was uploaded.
        if not current_document_id:
            return "Please upload a document first."

        vector_store = Chroma(
            persist_directory=DB_DIR,
            embedding_function=embedding_function,
            collection_name=COLLECTION_NAME,
        )

        # Search only within the current document.
        results = vector_store.similarity_search(
            query,
            k=2,
            filter={"document_id": current_document_id},
        )

        context = "\n".join(doc.page_content for doc in results)
        if not context:
            return "No relevant content found in the current document."

        messages = [
            HumanMessage(
                content=f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}"
            )
        ]
        response = llm.invoke(messages)
        return response.content
    except Exception as e:
        return f"Error generating response: {str(e)}"
95
+
96
# --- Gradio UI --------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 RAG Chatbot with Groq & ChromaDB")

    # Upload side: pick a file, process it, show the indexing status.
    file_input = gr.File(label="Upload a PDF")
    upload_button = gr.Button("Process Document")
    upload_status = gr.Textbox(label="Upload Status", interactive=False)

    # Query side: ask a question, show the model's answer.
    query_input = gr.Textbox(label="Ask a Question")
    response_output = gr.Textbox(label="Response", interactive=False)
    chat_button = gr.Button("Get Answer")

    # Wire the buttons to the processing functions.
    upload_button.click(upload_and_process, inputs=[file_input], outputs=[upload_status])
    chat_button.click(retrieve_and_generate_response, inputs=[query_input], outputs=[response_output])


# Launch the Gradio app.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-community
3
+ langchain-groq
4
+ chromadb
5
+ sentence-transformers
6
+ gradio
7
+ unstructured
8
+ pdf2image
9
+ python-magic
10
+ pdfminer.six
11
+ nltk
12
+ transformers
13
+ torch
14
+ numpy
15
+ Pillow
16
+ pypdf
17
+ python-docx