Sakil committed on
Commit
b19fc6b
·
verified ·
1 Parent(s): c413294

created app.py file

Browse files
Files changed (1) hide show
  1. app.py +306 -0
app.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """final_app
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1pG3uDsJzglvQecdTcY76aXa5ObFadRux
8
+ """
9
+
10
+ # !pip install gradio langchain langchain-community langchain-huggingface langchain-groq faiss-cpu sentence-transformers pypdf
11
+
12
+
13
+
14
+ import gradio as gr
15
+ import os
16
+ import tempfile
17
+ from langchain_community.document_loaders import PyPDFLoader
18
+ from langchain_community.vectorstores import FAISS
19
+ from langchain_huggingface import HuggingFaceEmbeddings
20
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
21
+ from langchain_groq import ChatGroq
22
+ from langchain.chains import RetrievalQA
23
+ from langchain.prompts import PromptTemplate
24
+
25
# Groq API key.
# SECURITY: a literal key used to be hard-coded on this line and was committed
# to version control. That key must be considered leaked and revoked. The key
# is now taken from the GROQ_API_KEY environment variable (for example a
# Hugging Face Space secret); it is an empty string when the variable is unset,
# in which case requests to Groq are expected to fail with an auth error.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Module-level state shared by the Gradio callbacks in this file.
# Vector index built by process_pdfs(); None until documents are processed.
vectorstore = None
# Base names of the PDF files that make up the current index.
processed_files_list = []
32
+
33
def process_pdfs(files):
    """Load the uploaded PDFs, chunk them, and build the FAISS vector store.

    On success the module-level ``vectorstore`` and ``processed_files_list``
    are replaced with the newly built index and the processed file names.

    Args:
        files: Uploaded files from the Gradio file component. Each item may be
            a plain path string or an object exposing the path as ``.name``.

    Returns:
        A ``(processing_report, status_line)`` tuple of strings. The second
        element is empty whenever processing did not succeed.
    """
    global vectorstore, processed_files_list

    if not files:
        return "⚠️ Please upload at least one PDF file", ""

    try:
        all_documents = []
        processed_names = []

        for file in files:
            # Depending on the Gradio version/configuration an upload arrives
            # either as a path string or as a wrapper with a ``.name`` path.
            pdf_path = os.fspath(getattr(file, "name", file))
            all_documents.extend(PyPDFLoader(pdf_path).load())
            processed_names.append(os.path.basename(pdf_path))

        if not all_documents:
            return "❌ No content extracted from PDFs", ""

        # Split documents into overlapping chunks for retrieval.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        splits = text_splitter.split_documents(all_documents)

        # Sentence-transformer embeddings, computed on CPU.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
        )

        # Publish the new index only after every step above succeeded, so a
        # failure leaves the previously built index untouched.
        vectorstore = FAISS.from_documents(splits, embeddings)
        processed_files_list = processed_names

        file_lines = "\n".join(f" • {name}" for name in processed_names)
        success_msg = (
            f"✅ Successfully processed {len(files)} document(s)!\n"
            f"📊 Created {len(splits)} text chunks for retrieval\n\n"
            "📄 Processed files:\n" + file_lines
        )

        return success_msg, "✅ Documents processed! You can now ask questions."

    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # letting the callback raise.
        return f"❌ Error processing documents: {str(e)}", ""
81
+
82
def _format_sources(source_docs, limit=3):
    """Build the Markdown "sources" footer appended to an answer.

    Args:
        source_docs: Retrieved documents in retriever order; each exposes a
            ``metadata`` mapping.
        limit: Number of leading documents to cite.

    Returns:
        Footer text listing each distinct ``source (Page n)`` label once, in
        the order the documents were returned.
    """
    labels = []
    for doc in source_docs[:limit]:
        source = doc.metadata.get('source', 'Unknown')
        page = doc.metadata.get('page', 'Unknown')
        # PyPDFLoader records zero-based page indices; show 1-based numbers.
        if isinstance(page, int):
            page += 1
        label = f"{source} (Page {page})"
        # A list (not a set) keeps the citations in a stable order.
        if label not in labels:
            labels.append(label)

    footer = "\n\n📌 **Sources found in documents:**"
    return footer + "".join(f"\n • {label}" for label in labels)


def answer_question(question, chat_history):
    """Answer a question using only the processed documents.

    Args:
        question: The user's question text.
        chat_history: Existing chat as a list of ``[question, answer]`` pairs;
            ``None`` is treated as an empty history.

    Returns:
        A new chat history list with one ``[question, reply]`` pair appended.
        The reply is the model's answer (plus a sources footer), a warning
        when no documents are loaded or the question is blank, or an error
        message if answering failed.
    """
    # Guard against a missing history so the list concatenations below
    # cannot fail with ``None + [...]``.
    chat_history = chat_history or []

    if not vectorstore:
        return chat_history + [[question, "⚠️ Please upload and process PDF documents first!"]]

    if not question or question.strip() == "":
        return chat_history + [[question, "⚠️ Please enter a valid question."]]

    try:
        # Temperature 0 keeps the answers as deterministic/factual as possible.
        llm = ChatGroq(
            model="llama-3.1-8b-instant",
            temperature=0,
            max_tokens=1024,
            api_key=GROQ_API_KEY,
        )

        # Prompt that restricts the model to the retrieved context.
        prompt_template = """You are a helpful assistant that answers questions ONLY based on the provided context from uploaded PDF documents.

CRITICAL INSTRUCTIONS:
- Answer ONLY if the information is present in the context below
- If the context does not contain relevant information to answer the question, you MUST respond with: "I don't know the answer. This information is not available in the uploaded documents."
- DO NOT use any external knowledge or information not present in the context
- DO NOT make assumptions or inferences beyond what is explicitly stated in the context
- If you're unsure whether the context contains the answer, say you don't know

Context from uploaded documents:
{context}

Question: {question}

Answer (only from the context above):"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"],
        )

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(
                search_type="similarity",
                search_kwargs={
                    "k": 5,  # Retrieve top 5 most relevant chunks
                    "fetch_k": 20,  # Fetch more candidates before filtering
                },
            ),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True,
        )

        # ``invoke`` replaces the deprecated ``chain(inputs)`` call style.
        result = qa_chain.invoke({"query": question})
        answer = result['result']
        source_docs = result.get('source_documents', [])

        # Cite sources only when the model actually answered.
        if source_docs and "don't know" not in answer.lower():
            answer += _format_sources(source_docs)

        return chat_history + [[question, answer]]

    except Exception as e:
        # UI boundary: surface the failure in the chat instead of raising.
        error_msg = f"❌ Error generating answer: {str(e)}"
        return chat_history + [[question, error_msg]]
165
+
166
def clear_data():
    """Discard the vector store and the list of processed files.

    Returns:
        A ``(processing_report, status_line, chat_history)`` tuple used to
        reset the processing textbox, the status textbox and the chatbot.
    """
    global vectorstore, processed_files_list
    vectorstore = None
    processed_files_list = []
    return "🗑️ All data cleared. Please upload new documents.", "", []
172
+
173
# Custom CSS passed to gr.Blocks(css=...): gradient-filled centered title
# (#title), centered grey subtitle (#subtitle), and a centered page container
# capped at 1200px wide (.gradio-container).
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 10px;
}
#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 20px;
}
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}
"""
195
+
196
# Build the Gradio interface: a document-upload column on the left, a chat
# column on the right, and the event wiring that connects both to the
# callbacks defined in this file.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Header
    gr.HTML("<h1 id='title'>📚 Slashbyte RAG</h1>")
    gr.HTML("<p id='subtitle'>Upload PDFs and ask questions using AI-powered retrieval</p>")

    with gr.Row():
        # Left column - Document Upload
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Document Upload")
            file_upload = gr.File(
                label="Upload PDF Documents",
                file_types=[".pdf"],
                file_count="multiple"
            )
            process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
            process_output = gr.Textbox(
                label="Processing Status",
                lines=8,
                interactive=False
            )
            clear_btn = gr.Button("🗑️ Clear All Data", variant="stop")

            gr.Markdown("""
            ---
            ### ℹ️ How to Use
            1. **Upload PDFs** using the file uploader
            2. Click **Process Documents**
            3. **Ask questions** in the chat
            4. Get **AI-powered answers**

            **Features:**
            - 📄 Multiple PDF support
            - 🤖 Powered by Groq LLM
            - 🔍 Semantic search
            - 💾 Chat history
            """)

        # Right column - Chat Interface
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Ask Questions")
            status_text = gr.Textbox(
                label="Status",
                value="⚠️ Upload and process documents to start",
                interactive=False
            )
            chatbot = gr.Chatbot(
                label="Chat History",
                height=400,
                show_label=True
            )
            with gr.Row():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask anything about your documents...",
                    scale=4
                )
                submit_btn = gr.Button("🚀 Ask", variant="primary", scale=1)

            clear_chat_btn = gr.Button("🧹 Clear Chat")

    # Footer
    gr.HTML("""
    <div style='text-align: center; color: #666; padding: 20px; margin-top: 20px; border-top: 1px solid #ddd;'>
    <p>Powered by Langchain, Groq, and HuggingFace | Built with ❤️ using Gradio</p>
    </div>
    """)

    # Event handlers
    process_btn.click(
        fn=process_pdfs,
        inputs=[file_upload],
        outputs=[process_output, status_text]
    )

    # The "Ask" button and pressing Enter in the question box behave the
    # same way: answer the question, then empty the input box.
    for ask_trigger in (submit_btn.click, question_input.submit):
        ask_trigger(
            fn=answer_question,
            inputs=[question_input, chatbot],
            outputs=[chatbot]
        ).then(
            lambda: "",
            outputs=[question_input]
        )

    # Clears only the visible conversation; the document index is kept.
    clear_chat_btn.click(
        fn=lambda: [],
        outputs=[chatbot]
    )

    # Clears the document index and resets both status boxes and the chat.
    clear_btn.click(
        fn=clear_data,
        outputs=[process_output, status_text, chatbot]
    )
298
+
299
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    # Listen on every interface at port 7860 and request a public share link.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)
306
+