Navneet Sai commited on
Commit
1bfb382
·
1 Parent(s): de95ad8

Initial RAG App

Browse files
Files changed (3) hide show
  1. README.md +32 -7
  2. app.py +291 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,13 +1,38 @@
1
  ---
2
- title: Rag Qa Document
3
- emoji: 🏒
4
- colorFrom: purple
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: RAG Document Q&A Assistant
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
+ # RAG Document Q&A Assistant
13
+
14
+ Upload a PDF or TXT document and ask questions about its content.
15
+
16
+ ## How It Works
17
+
18
+ 1. **Document Processing**: Your document is split into chunks using the selected strategy (fixed-size or paragraph-based)
19
+ 2. **Embedding**: Chunks are embedded using Sentence Transformers (all-MiniLM-L6-v2)
20
+ 3. **Retrieval**: When you ask a question, relevant chunks are retrieved using semantic search via ChromaDB
21
+ 4. **Generation**: GPT-4o-mini generates an answer based on the retrieved context
22
+
23
+ ## Features
24
+
25
+ - PDF and TXT file support
26
+ - Two chunking strategies for comparison
27
+ - Source citations with relevance scores
28
+ - Built with Gradio, ChromaDB, and OpenAI API
29
+
30
+ ## References
31
+
32
+ - [RAG Original Paper (Lewis et al., 2020)](https://arxiv.org/abs/2005.11401)
33
+ - [RAG Survey (Gao et al., 2023)](https://arxiv.org/pdf/2312.10997)
34
+ - [Chunking Strategies for RAG (Merola & Singh, 2025)](https://arxiv.org/abs/2504.19754)
35
+
36
+ ## Author
37
+
38
+ Built as part of an AI/ML Engineering portfolio project.
app.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG Document Q&A Assistant
3
+ Upload documents, ask questions, get answers with source citations.
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ from typing import Optional
9
+
10
+ import chromadb
11
+ import fitz # PyMuPDF
12
+ import gradio as gr
13
+ from chromadb.utils import embedding_functions
14
+ from openai import OpenAI
15
+
16
# OpenAI client used for answer generation; the key is read from the
# OPENAI_API_KEY environment variable (set as a Space secret).
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Shared sentence-transformer embedding function passed to the Chroma
# collection (all-MiniLM-L6-v2, a small general-purpose model).
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Module-level session state, (re)assigned by process_document().
# NOTE(review): this state is shared by every visitor of the Space —
# concurrent uploads overwrite each other; acceptable for a demo,
# confirm before multi-user use.
chroma_client = None  # chromadb.Client for the currently loaded document
collection = None  # Chroma collection holding the current document's chunks
current_chunks = []  # chunk dicts produced by the selected chunking strategy
28
+
29
+
30
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of all pages, in page order. May be empty
        for image-only/scanned PDFs with no text layer.

    Raises:
        Whatever ``fitz.open`` raises for unreadable or corrupt files;
        the caller (process_document) catches and reports it.
    """
    doc = fitz.open(file_path)
    try:
        # str.join instead of repeated += (avoids quadratic behavior on
        # large documents).
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the document handle — the original leaked it
        # when get_text() raised mid-extraction.
        doc.close()
38
+
39
+
40
def extract_text_from_txt(file_path: str) -> str:
    """Read a text file as UTF-8, silently dropping undecodable bytes."""
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents
44
+
45
+
46
def chunk_fixed_size(text: str, chunk_size: int = 500, overlap: int = 100) -> list[dict]:
    """Split text into fixed-size character chunks with overlap.

    Args:
        text: Source text to split.
        chunk_size: Maximum characters per chunk. Must be > 0.
        overlap: Characters shared between consecutive chunks. Must
            satisfy 0 <= overlap < chunk_size so the window always
            advances.

    Returns:
        List of dicts with keys "id", "text" (whitespace-stripped), and
        the half-open character span "start"/"end" into the original
        text. Whitespace-only windows are skipped; ids stay sequential.
        NOTE: "start"/"end" describe the raw window, not the stripped
        text.

    Raises:
        ValueError: If chunk_size <= 0 or overlap is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size made the original loop spin forever:
        # start = end - overlap never moved past end of text.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    start = 0
    chunk_id = 0

    while start < len(text):
        end = start + chunk_size
        chunk_text = text[start:end].strip()

        if chunk_text:
            chunks.append({
                "id": f"chunk_{chunk_id}",
                "text": chunk_text,
                "start": start,
                # Clamp so the reported span never exceeds the text
                # (the original could report end > len(text) on the
                # final chunk).
                "end": min(end, len(text)),
            })
            chunk_id += 1

        # Advance by the unclamped window so the stride stays constant.
        start = end - overlap

    return chunks
68
+
69
+
70
def chunk_by_paragraph(text: str) -> list[dict]:
    """Split text on blank lines, keeping paragraphs longer than 50 chars.

    Chunk ids reuse each paragraph's position in the non-empty split, so
    ids may be non-contiguous when short paragraphs are filtered out.
    Character offsets are not tracked for this strategy, so "start" and
    "end" are always 0.
    """
    paragraphs = [segment.strip() for segment in text.split("\n\n") if segment.strip()]
    return [
        {"id": f"chunk_{index}", "text": paragraph, "start": 0, "end": 0}
        for index, paragraph in enumerate(paragraphs)
        if len(paragraph) > 50
    ]
85
+
86
+
87
def process_document(file, chunking_strategy: str) -> str:
    """Extract, chunk, and index an uploaded document for retrieval.

    Replaces any previously indexed document: a fresh Chroma client and
    "documents" collection are created on every call, and the module
    globals (chroma_client, collection, current_chunks) are reassigned.

    Args:
        file: Gradio file object (or None); only its ``.name`` path is
            used.
        chunking_strategy: "Fixed-size (500 chars)" selects fixed-size
            chunking; any other value falls back to paragraph chunking.

    Returns:
        A user-facing Markdown status string (starts with ✅/❌).
    """
    global chroma_client, collection, current_chunks

    if file is None:
        return "❌ Please upload a document first."

    file_path = file.name
    file_ext = os.path.splitext(file_path)[1].lower()

    try:
        if file_ext == ".pdf":
            text = extract_text_from_pdf(file_path)
        elif file_ext in [".txt", ".md"]:
            text = extract_text_from_txt(file_path)
        else:
            return f"❌ Unsupported file type: {file_ext}. Please upload PDF or TXT."
    except Exception as e:
        # UI boundary: surface the failure instead of crashing the app.
        return f"❌ Error reading file: {str(e)}"

    if not text.strip():
        return "❌ No text could be extracted from the document."

    if chunking_strategy == "Fixed-size (500 chars)":
        current_chunks = chunk_fixed_size(text, chunk_size=500, overlap=100)
    else:
        current_chunks = chunk_by_paragraph(text)

    if not current_chunks:
        return "❌ No chunks could be created from the document."

    # Initialize a fresh in-memory Chroma client and collection so stale
    # chunks from a previous upload never leak into retrieval.
    chroma_client = chromadb.Client()
    try:
        chroma_client.delete_collection(name="documents")
    except Exception:
        # Collection may not exist yet; deletion is best-effort. The
        # original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt — narrowed to Exception.
        pass
    collection = chroma_client.create_collection(
        name="documents",
        embedding_function=embedding_func
    )

    collection.add(
        documents=[c["text"] for c in current_chunks],
        ids=[c["id"] for c in current_chunks]
    )

    return f"✅ Document processed successfully!\n\n📊 **Stats:**\n- Characters: {len(text):,}\n- Chunks created: {len(current_chunks)}\n- Chunking strategy: {chunking_strategy}"
135
+
136
+
137
def retrieve_context(query: str, top_k: int = 3) -> list[dict]:
    """Semantic-search the indexed document for chunks relevant to query.

    Returns up to top_k dicts with "text", "similarity" (a 0-1 score
    derived from Chroma's distance), and 1-based "rank". Returns an
    empty list when no document has been processed yet.
    """
    if collection is None:
        return []

    results = collection.query(
        query_texts=[query],
        n_results=top_k
    )

    documents = results["documents"][0]
    distances = results["distances"][0]

    # Map distance (0 = identical) onto a 0-1 "relevance" score.
    return [
        {
            "text": document,
            "similarity": 1 / (1 + distance),
            "rank": rank,
        }
        for rank, (document, distance) in enumerate(zip(documents, distances), start=1)
    ]
160
+
161
+
162
def generate_answer(query: str, context_docs: list[dict]) -> str:
    """Ask GPT-4o-mini to answer *query* grounded in retrieved chunks.

    Each entry of context_docs needs "rank", "similarity", and "text"
    keys (as produced by retrieve_context). Returns the model's answer,
    or a user-facing fallback/error string — this function never raises.
    """
    if not context_docs:
        return "I don't have any context to answer this question. Please upload a document first."

    numbered_sources = [
        f"[Source {doc['rank']}] (relevance: {doc['similarity']:.0%})\n{doc['text']}"
        for doc in context_docs
    ]
    context = "\n\n".join(numbered_sources)

    prompt = f"""Answer the question based on the provided context.
If the context doesn't contain enough information to answer fully, say so.
Always reference which source(s) you used.

CONTEXT:
{context}

QUESTION: {query}

ANSWER:"""

    try:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and cite your sources."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        # Catch-all at the UI boundary: report API/parse failures inline.
        return f"❌ Error generating answer: {str(e)}"
196
+
197
+
198
def ask_question(query: str) -> tuple[str, str]:
    """Answer a user question against the currently indexed document.

    Returns (answer_markdown, sources_markdown); both are user-facing
    strings rendered by gr.Markdown outputs. Guard clauses handle the
    empty-question and no-document cases.
    """
    if not query.strip():
        return "Please enter a question.", ""

    if collection is None:
        return "Please upload and process a document first.", ""

    retrieved = retrieve_context(query, top_k=3)
    answer = generate_answer(query, retrieved)

    # Show each retrieved chunk (truncated to 300 chars) with its score.
    source_sections = [
        f"**[Source {doc['rank']}]** (relevance: {doc['similarity']:.0%})\n"
        f"```\n{doc['text'][:300]}{'...' if len(doc['text']) > 300 else ''}\n```\n\n"
        for doc in retrieved
    ]
    sources = "\n\n---\n\n**📚 Retrieved Sources:**\n\n" + "".join(source_sections)

    return answer, sources
215
+
216
+
217
# Build the Gradio interface. Layout: left column = upload & chunking
# controls, right column = Q&A; event wiring is at the bottom of the
# Blocks context. Markdown string content is kept flush-left so no line
# is indented 4+ spaces (which Markdown would render as a code block).
with gr.Blocks(title="RAG Document Q&A", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 📄 RAG Document Q&A Assistant

Upload a document (PDF or TXT), choose a chunking strategy, and ask questions!

**How it works:**
1. Your document is split into chunks using the selected strategy
2. Chunks are embedded using Sentence Transformers (all-MiniLM-L6-v2)
3. When you ask a question, relevant chunks are retrieved using semantic search
4. GPT-4o-mini generates an answer based on the retrieved context

---
""")

    with gr.Row():
        # Left column: document upload and processing controls.
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Step 1: Upload Document")
            file_input = gr.File(
                label="Upload PDF or TXT",
                file_types=[".pdf", ".txt", ".md"]
            )
            chunking_dropdown = gr.Dropdown(
                choices=["Fixed-size (500 chars)", "Paragraph-based"],
                value="Paragraph-based",
                label="Chunking Strategy"
            )
            process_btn = gr.Button("Process Document", variant="primary")
            process_output = gr.Markdown(label="Processing Status")

        # Right column: question input plus answer/sources display.
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Step 2: Ask Questions")
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What is this document about?",
                lines=2
            )
            ask_btn = gr.Button("Ask", variant="primary")

            answer_output = gr.Markdown(label="Answer")
            sources_output = gr.Markdown(label="Sources")

    gr.Markdown("""
---

**📚 References:**
- [RAG Original Paper (Lewis et al., 2020)](https://arxiv.org/abs/2005.11401)
- [RAG Survey (Gao et al., 2023)](https://arxiv.org/pdf/2312.10997)
- [Chunking Strategies for RAG (Merola & Singh, 2025)](https://arxiv.org/abs/2504.19754)

Built as part of an AI/ML Engineering portfolio project.
""")

    # Event wiring: processing is button-only; asking is triggered by
    # either the Ask button or pressing Enter in the textbox (same
    # handler, same outputs).
    process_btn.click(
        fn=process_document,
        inputs=[file_input, chunking_dropdown],
        outputs=[process_output]
    )

    ask_btn.click(
        fn=ask_question,
        inputs=[question_input],
        outputs=[answer_output, sources_output]
    )

    question_input.submit(
        fn=ask_question,
        inputs=[question_input],
        outputs=[answer_output, sources_output]
    )
288
+
289
+
290
if __name__ == "__main__":
    # Launch the Gradio server when the file is run directly.
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ chromadb>=0.4.0
3
+ sentence-transformers>=2.2.0
4
+ openai>=1.0.0
5
+ pymupdf>=1.23.0