Fakhruddin90 committed on
Commit
96f2e64
·
1 Parent(s): 24d1df2

Initial commit

Browse files
Files changed (2) hide show
  1. app.py +333 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Main application file for Hugging Face Space
2
+ import gradio as gr
3
+ import os
4
+ from typing import List, Tuple
5
+ import numpy as np
6
+ from sentence_transformers import SentenceTransformer
7
+ import faiss
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ import PyPDF2
10
+ import docx
11
+ import openai
12
+ import tempfile
13
+
14
class RAGChatbot:
    """Retrieval-augmented chatbot over user-uploaded documents.

    Text extracted from PDF, DOCX and TXT files is split into overlapping
    chunks, embedded with a Sentence-Transformers model and stored in a FAISS
    ``IndexFlatL2``. For each question the nearest chunks are retrieved and
    sent to the OpenAI chat completions API as context.
    """

    def __init__(self):
        """Initialize the RAG chatbot with embedding model and vector store."""
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Flat L2 index sized for all-MiniLM-L6-v2 embeddings.
        self.dimension = 384
        self.index = faiss.IndexFlatL2(self.dimension)

        # Chunk texts; entry i is the text whose vector was added i-th to
        # the index, so FAISS labels can be used directly as list positions.
        self.documents = []

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )

        # The key is read from the environment (a Space secret) and kept on
        # the openai module, where generate_response() looks it up.
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def read_pdf(self, file_path: str) -> str:
        """Extract text from a PDF file.

        Returns the concatenated text of all pages. On error the problem is
        printed and whatever was extracted before the failure is returned.
        """
        page_texts = []
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # extract_text() may yield nothing for a page.
                    page_texts.append(page.extract_text() or "")
        except Exception as e:
            print(f"Error reading PDF: {e}")
        return "".join(page_texts)

    def read_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file, one line per paragraph.

        On error the problem is printed and the text gathered so far is
        returned.
        """
        lines = []
        try:
            doc = docx.Document(file_path)
            for paragraph in doc.paragraphs:
                lines.append(paragraph.text + "\n")
        except Exception as e:
            print(f"Error reading DOCX: {e}")
        return "".join(lines)

    def read_txt(self, file_path: str) -> str:
        """Read a UTF-8 text file; return "" (and print the error) on failure."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            print(f"Error reading TXT: {e}")
            return ""

    def process_documents(self, files) -> str:
        """Process uploaded documents and add their chunks to the vector store.

        Args:
            files: Iterable of uploads. Each item may be a path string, a
                path-like object, or an object exposing the path as ``.name``.

        Returns:
            A human-readable status message for the UI.
        """
        if not files:
            return "No files uploaded."

        readers = {
            '.pdf': self.read_pdf,
            '.docx': self.read_docx,
            '.txt': self.read_txt,
        }

        texts = []
        for file in files:
            # Accept plain paths as well as upload objects carrying `.name`.
            if isinstance(file, (str, os.PathLike)):
                file_path = os.fspath(file)
            else:
                file_path = file.name

            # Match the extension case-insensitively so "REPORT.PDF" works.
            extension = os.path.splitext(file_path)[1].lower()
            reader = readers.get(extension)
            if reader is None:
                continue  # unsupported type: skip it

            try:
                text = reader(file_path)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                continue

            # Only files that actually yielded text count as processed.
            if text.strip():
                texts.append(text)

        if not texts:
            return "No text content found in the uploaded documents."

        all_text = "".join(text + "\n" for text in texts)
        chunks = self.text_splitter.split_text(all_text)
        if not chunks:
            return "No text chunks created from documents."

        # Add all vectors in one call. The index is updated before the chunk
        # list so a failure here cannot leave the two out of step.
        embeddings = np.asarray(
            self.embedding_model.encode(chunks), dtype="float32"
        )
        self.index.add(embeddings)
        self.documents.extend(chunks)

        return f"✅ Successfully processed {len(chunks)} text chunks from {len(texts)} documents."

    def retrieve_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Retrieve up to ``k`` stored chunks most relevant to ``query``.

        Returns an empty list when nothing has been indexed or ``k <= 0``.
        """
        top_k = min(k, len(self.documents))
        if top_k <= 0:
            return []

        query_embedding = np.asarray(
            self.embedding_model.encode([query]), dtype="float32"
        )
        _, indices = self.index.search(query_embedding, top_k)

        # Drop out-of-range labels; a negative label would otherwise index
        # the list from its end and return an unrelated chunk.
        return [
            self.documents[idx]
            for idx in indices[0]
            if 0 <= idx < len(self.documents)
        ]

    def generate_response(self, query: str, context: List[str]) -> str:
        """Generate an answer with the OpenAI API using the retrieved context.

        Returns the model's answer, or an explanatory message when the API
        key is missing, no context is available, or the API call fails.
        """
        if not openai.api_key:
            return "⚠️ OpenAI API key not configured. Please add OPENAI_API_KEY to the Space secrets."

        if not context:
            return "No relevant documents found. Please upload documents first."

        # Limit context to avoid token limits.
        context_str = "\n\n".join(context[:3])

        prompt = (
            "You are a helpful assistant. Use the following context to answer the question.\n"
            "If you cannot answer the question based on the context, say so.\n"
            "\n"
            "Context:\n"
            f"{context_str}\n"
            "\n"
            f"Question: {query}\n"
            "\n"
            "Answer:"
        )

        try:
            # Imported here so an import problem is reported to the user as
            # a normal error message instead of breaking module import.
            from openai import OpenAI
            client = OpenAI(api_key=openai.api_key)

            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that answers questions based on provided context."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=500,
                temperature=0.7,
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error generating response: {str(e)}"

    def chat(self, message: str, history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
        """Answer ``message`` and append the exchange to ``history``.

        Returns ``("", history)`` so the UI clears the input box and shows
        the updated conversation. A missing or blank message leaves the
        history unchanged.
        """
        if history is None:
            history = []

        # The textbox can deliver None (e.g. after being cleared), not only "".
        if not message or not message.strip():
            return "", history

        relevant_chunks = self.retrieve_relevant_chunks(message)
        response = self.generate_response(message, relevant_chunks)
        history.append((message, response))

        return "", history
192
+
193
# One chatbot instance is shared by every UI callback below.
print("Initializing RAG Chatbot...")
chatbot = RAGChatbot()

# Gradio UI: three tabs (upload, chat, settings) inside a single Blocks app.
with gr.Blocks(title="RAG Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🤖 RAG Chatbot with Gradio

        Upload your documents and start asking questions! The chatbot will retrieve relevant information from your documents to answer your queries.

        **Supported formats:** PDF, DOCX, TXT | **Powered by:** Sentence-BERT + FAISS + OpenAI
        """
    )

    # --- Tab 1: document ingestion -------------------------------------
    with gr.Tab("📄 Upload Documents"):
        file_upload = gr.File(
            label="Upload Documents",
            file_count="multiple",
            file_types=[".pdf", ".docx", ".txt"],
        )
        upload_button = gr.Button("Process Documents", variant="primary")
        upload_status = gr.Textbox(label="Status", interactive=False)

        # The status message returned by process_documents fills the textbox.
        upload_button.click(
            fn=chatbot.process_documents,
            inputs=[file_upload],
            outputs=[upload_status],
        )

    # --- Tab 2: question answering -------------------------------------
    with gr.Tab("💬 Chat"):
        chatbot_interface = gr.Chatbot(
            label="Chat History",
            height=400,
            bubble_full_width=False,
        )

        with gr.Row():
            msg = gr.Textbox(
                label="Your Question",
                placeholder="Ask a question about your documents...",
                lines=1,
                scale=4,
            )
            submit_btn = gr.Button("Send", variant="primary", scale=1)

        clear = gr.Button("🗑️ Clear Chat")

        # Pressing Enter in the textbox and clicking "Send" run the same
        # handler: it receives (message, history) and returns the cleared
        # textbox value together with the updated history.
        chat_io = [msg, chatbot_interface]
        for register in (msg.submit, submit_btn.click):
            register(
                fn=chatbot.chat,
                inputs=list(chat_io),
                outputs=list(chat_io),
            )

        # Reset both the input box and the conversation.
        clear.click(
            lambda: (None, []),
            outputs=list(chat_io),
        )

    # --- Tab 3: static configuration summary ---------------------------
    with gr.Tab("⚙️ Settings"):
        gr.Markdown(
            """
            ### Configuration

            | Component | Details |
            |-----------|---------|
            | **Embedding Model** | all-MiniLM-L6-v2 |
            | **Vector Store** | FAISS |
            | **LLM** | OpenAI GPT-3.5-turbo |
            | **Chunk Size** | 500 characters |
            | **Chunk Overlap** | 50 characters |
            | **Retrieved Chunks** | 3 |

            ### About
            This RAG chatbot uses retrieval-augmented generation to answer questions based on your uploaded documents.
            """
        )

# Start the web server for the app.
demo.launch()
282
+
283
+ # -----------------------------------
284
+ # requirements.txt - Dependencies file
285
+ """
286
+ gradio==4.19.2
287
+ sentence-transformers==2.3.1
288
+ faiss-cpu==1.7.4
289
+ langchain==0.1.6
290
+ openai==1.12.0
291
+ PyPDF2==3.0.1
292
+ python-docx==1.1.0
293
+ numpy==1.24.3
294
+ """
295
+
296
+ # -----------------------------------
297
+ # README.md - Documentation for your Space
298
+ """
299
+ ---
300
+ title: RAG Chatbot
301
+ emoji: 🤖
302
+ colorFrom: blue
303
+ colorTo: green
304
+ sdk: gradio
305
+ sdk_version: 4.19.2
306
+ app_file: app.py
307
+ pinned: false
308
+ license: mit
309
+ ---
310
+
311
+ # RAG Chatbot
312
+
313
+ A Retrieval-Augmented Generation chatbot built with Gradio, FAISS, and OpenAI.
314
+
315
+ ## Features
316
+ - Upload PDF, DOCX, and TXT documents
317
+ - Semantic search using Sentence-BERT embeddings
318
+ - Context-aware responses using OpenAI GPT-3.5
319
+ - Interactive chat interface
320
+
321
+ ## Setup
322
+ Add your OpenAI API key to the Space secrets:
323
+ 1. Go to Settings → Variables and secrets
324
+ 2. Add a new secret named `OPENAI_API_KEY`
325
+ 3. Paste your OpenAI API key
326
+
327
+ ## Usage
328
+ 1. Upload your documents in the Upload Documents tab
329
+ 2. Wait for processing confirmation
330
+ 3. Go to the Chat tab and start asking questions!
331
+
332
+ Check out the [GitHub repository](https://github.com/yourusername/rag-chatbot) for more details.
333
+ """
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==4.19.2
2
+ sentence-transformers==2.3.1
3
+ faiss-cpu==1.7.4
4
+ langchain==0.1.6
5
+ openai==1.12.0
6
+ PyPDF2==3.0.1
7
+ python-docx==1.1.0
8
+ numpy==1.24.3