Sakil committed on
Commit
be5e1bb
·
verified ·
1 Parent(s): 4e0e3fd

created app file

Browse files
Files changed (1) hide show
  1. app.py +298 -0
app.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """final_app
3
+ Automatically generated by Colab.
4
+ Original file is located at
5
+ https://colab.research.google.com/drive/1pG3uDsJzglvQecdTcY76aXa5ObFadRux
6
+ """
7
+
8
+ # !pip install gradio langchain langchain-community langchain-huggingface langchain-groq faiss-cpu sentence-transformers pypdf
9
+
10
+
11
+
12
+ import gradio as gr
13
+ import os
14
+ import tempfile
15
+ from langchain_community.document_loaders import PyPDFLoader
16
+ from langchain_community.vectorstores import FAISS
17
+ from langchain_huggingface import HuggingFaceEmbeddings
18
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
19
+ from langchain_groq import ChatGroq
20
+ from langchain.chains import RetrievalQA
21
+ from langchain.prompts import PromptTemplate
22
# Groq API Key
# SECURITY FIX: the key was previously hard-coded in source (and therefore
# committed to version control). Read it from the environment instead; the
# old value should be revoked. Set GROQ_API_KEY in the Space/host secrets.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if GROQ_API_KEY:
    # Keep the env var populated for libraries that read it directly.
    os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# Global variables to store vectorstore and processed files
vectorstore = None        # FAISS index built by process_pdfs(); None until documents are processed
processed_files_list = []  # basenames of the PDFs folded into the current vectorstore
31
def _get_embeddings():
    """Return the sentence-transformer embedding model, creating it once.

    Loading the HuggingFace model is expensive, so the instance is memoised
    on the function object and reused across processing calls.
    """
    if getattr(_get_embeddings, "_cached", None) is None:
        _get_embeddings._cached = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )
    return _get_embeddings._cached

def process_pdfs(files):
    """Process uploaded PDF files and create vector store.

    Args:
        files: list of Gradio file objects (each exposes a ``.name`` path),
            or None/empty when nothing was uploaded.

    Returns:
        Tuple of (processing-status message, chat-status message) strings
        for the two UI textboxes. On failure the second element is "".
    """
    global vectorstore, processed_files_list

    if not files:
        return "⚠️ Please upload at least one PDF file", ""

    try:
        all_documents = []
        processed_names = []

        # Load every uploaded PDF into LangChain documents.
        for file in files:
            loader = PyPDFLoader(file.name)
            documents = loader.load()
            all_documents.extend(documents)
            processed_names.append(os.path.basename(file.name))

        if not all_documents:
            return "❌ No content extracted from PDFs", ""

        # Split documents into overlapping chunks for retrieval.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        splits = text_splitter.split_documents(all_documents)

        # Build the FAISS index. PERF: the embedding model is cached by
        # _get_embeddings() instead of being re-loaded on every call.
        vectorstore = FAISS.from_documents(splits, _get_embeddings())
        processed_files_list = processed_names

        success_msg = f"βœ… Successfully processed {len(files)} document(s)!\n"
        success_msg += f"πŸ“Š Created {len(splits)} text chunks for retrieval\n\n"
        success_msg += "πŸ“„ Processed files:\n" + "\n".join([f" β€’ {name}" for name in processed_names])

        return success_msg, "βœ… Documents processed! You can now ask questions."

    except Exception as e:
        # Surface the failure in the status box instead of crashing the UI.
        return f"❌ Error processing documents: {str(e)}", ""
79
+
80
def answer_question(question, chat_history):
    """Answer a question from the processed documents via retrieval-QA.

    Args:
        question: user question string from the textbox.
        chat_history: list of [question, answer] pairs shown in the Chatbot.

    Returns:
        A new chat-history list with the [question, answer] pair appended
        (errors and guard messages are also delivered as chat answers).
    """
    global vectorstore

    # Guard: retrieval needs an index built by process_pdfs() first.
    if not vectorstore:
        return chat_history + [[question, "⚠️ Please upload and process PDF documents first!"]]

    if not question or question.strip() == "":
        return chat_history + [[question, "⚠️ Please enter a valid question."]]

    try:
        # Initialize LLM with stricter temperature for factual answers
        llm = ChatGroq(
            model="llama-3.1-8b-instant",
            temperature=0,  # Set to 0 for most deterministic, factual responses
            max_tokens=1024,
            api_key=GROQ_API_KEY
        )

        # Create custom prompt with strict context-only answering
        prompt_template = """You are a helpful assistant that answers questions ONLY based on the provided context from uploaded PDF documents.
CRITICAL INSTRUCTIONS:
- Answer ONLY if the information is present in the context below
- If the context does not contain relevant information to answer the question, you MUST respond with: "I don't know the answer. This information is not available in the uploaded documents."
- DO NOT use any external knowledge or information not present in the context
- DO NOT make assumptions or inferences beyond what is explicitly stated in the context
- If you're unsure whether the context contains the answer, say you don't know
Context from uploaded documents:
{context}
Question: {question}
Answer (only from the context above):"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        # Create retrieval chain with enhanced retrieval settings
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(
                search_type="similarity",
                search_kwargs={
                    "k": 5,        # Retrieve top 5 most relevant chunks
                    "fetch_k": 20  # Fetch more candidates before filtering
                }
            ),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

        # Get response
        result = qa_chain({"query": question})
        answer = result['result']
        source_docs = result.get('source_documents', [])

        # Append source citations unless the model declined to answer.
        # FIX: dedupe with an ordered list instead of a set so the cited
        # sources keep retrieval-relevance order (set iteration order is
        # arbitrary and made the output nondeterministic).
        if source_docs and "don't know" not in answer.lower():
            answer += "\n\nπŸ“Œ **Sources found in documents:**"
            unique_sources = []
            for doc in source_docs[:3]:  # Show top 3 sources
                source = doc.metadata.get('source', 'Unknown')
                page = doc.metadata.get('page', 'Unknown')
                source_id = f"{source} (Page {page})"
                if source_id not in unique_sources:
                    unique_sources.append(source_id)

            for source in unique_sources:
                answer += f"\n β€’ {source}"

        # Update chat history
        return chat_history + [[question, answer]]

    except Exception as e:
        # Keep the UI responsive: report failures inside the chat itself.
        error_msg = f"❌ Error generating answer: {str(e)}"
        return chat_history + [[question, error_msg]]
159
+
160
+ def clear_data():
161
+ """Clear all processed data"""
162
+ global vectorstore, processed_files_list
163
+ vectorstore = None
164
+ processed_files_list = []
165
+ return "πŸ—‘οΈ All data cleared. Please upload new documents.", "", []
166
+
167
# Custom CSS for better styling.
# Injected via gr.Blocks(css=custom_css); the #title / #subtitle selectors
# match the id attributes written in the gr.HTML header elements below.
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 10px;
}
#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 20px;
}
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}
"""
189
+
190
# Create Gradio interface.
# Two-column layout: upload/processing controls on the left, chat on the right.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Header
    gr.HTML("<h1 id='title'>πŸ“š Slashbyte RAG</h1>")
    gr.HTML("<p id='subtitle'>Upload PDFs and ask questions using AI-powered retrieval</p>")

    with gr.Row():
        # Left column - Document Upload
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“„ Document Upload")
            file_upload = gr.File(
                label="Upload PDF Documents",
                file_types=[".pdf"],
                file_count="multiple"
            )
            process_btn = gr.Button("πŸ”„ Process Documents", variant="primary", size="lg")
            process_output = gr.Textbox(
                label="Processing Status",
                lines=8,
                interactive=False
            )
            clear_btn = gr.Button("πŸ—‘οΈ Clear All Data", variant="stop")

            gr.Markdown("""
---
### ℹ️ How to Use
1. **Upload PDFs** using the file uploader
2. Click **Process Documents**
3. **Ask questions** in the chat
4. Get **AI-powered answers**
**Features:**
- πŸ“„ Multiple PDF support
- πŸ€– Powered by Groq LLM
- πŸ” Semantic search
- πŸ’Ύ Chat history
""")

        # Right column - Chat Interface
        with gr.Column(scale=2):
            gr.Markdown("### πŸ’¬ Ask Questions")
            status_text = gr.Textbox(
                label="Status",
                value="⚠️ Upload and process documents to start",
                interactive=False
            )
            chatbot = gr.Chatbot(
                label="Chat History",
                height=400,
                show_label=True
            )
            with gr.Row():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask anything about your documents...",
                    scale=4
                )
                submit_btn = gr.Button("πŸš€ Ask", variant="primary", scale=1)

            clear_chat_btn = gr.Button("🧹 Clear Chat")

    # Footer
    gr.HTML("""
    <div style='text-align: center; color: #666; padding: 20px; margin-top: 20px; border-top: 1px solid #ddd;'>
        <p>Powered by Langchain, Groq, and HuggingFace | Built with ❀️ using Gradio</p>
    </div>
    """)

    # Event handlers
    # Process uploaded PDFs; report into the status box and the chat status.
    process_btn.click(
        fn=process_pdfs,
        inputs=[file_upload],
        outputs=[process_output, status_text]
    )

    # Answer on button click, then clear the question box for the next turn.
    submit_btn.click(
        fn=answer_question,
        inputs=[question_input, chatbot],
        outputs=[chatbot]
    ).then(
        lambda: "",
        outputs=[question_input]
    )

    # Same flow when the user presses Enter inside the question textbox.
    question_input.submit(
        fn=answer_question,
        inputs=[question_input, chatbot],
        outputs=[chatbot]
    ).then(
        lambda: "",
        outputs=[question_input]
    )

    # Empty only the visible chat history (keeps the vectorstore intact).
    clear_chat_btn.click(
        fn=lambda: [],
        outputs=[chatbot]
    )

    # Drop the vectorstore, processed-file list, and chat via clear_data().
    clear_btn.click(
        fn=clear_data,
        outputs=[process_output, status_text, chatbot]
    )
291
+
292
# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,              # NOTE(review): share links look unnecessary when self-hosting on 0.0.0.0 — confirm intent
        server_name="0.0.0.0",   # listen on all interfaces (container/Space friendly)
        server_port=7860         # standard Gradio port
    )