Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files
- Dockerfile +25 -0
- app.py +332 -0
- config.py +22 -0
- pdf_processor.py +98 -0
- rag_engine.py +90 -0
- requirements.txt +7 -0
- vector_db/.DS_Store +0 -0
- vector_store.py +115 -0
Dockerfile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Container image for the PDF RAG Gradio application.
FROM python:3.9-slim

WORKDIR /app

# Compilers needed to build native wheels (faiss-cpu / sentence-transformers deps).
# --no-install-recommends keeps the image lean; the apt cache is purged in the
# same layer so it never persists in the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer stays cached until
# requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Runtime directories: PDF uploads and the persistent FAISS index
RUN mkdir -p uploads vector_db

# Gradio's default port; app.py binds 0.0.0.0:7860
EXPOSE 7860

# Run the application
CMD ["python", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import uuid
|
| 4 |
+
import tempfile
|
| 5 |
+
from typing import List, Tuple, Optional
|
| 6 |
+
from config import Config
|
| 7 |
+
from pdf_processor import PDFProcessor
|
| 8 |
+
from vector_store import VectorStore
|
| 9 |
+
from rag_engine import RAGEngine
|
| 10 |
+
|
| 11 |
+
# Initialize components
# Module-level singletons shared by every Gradio callback below.
# PDFProcessor splits extracted page text into overlapping character chunks.
pdf_processor = PDFProcessor(
    chunk_size=Config.CHUNK_SIZE,
    chunk_overlap=Config.CHUNK_OVERLAP
)

# VectorStore embeds chunks and persists a FAISS index under Config.VECTOR_DB_PATH,
# reloading any existing index at startup.
vector_store = VectorStore(
    model_name=Config.EMBEDDING_MODEL,
    vector_db_path=Config.VECTOR_DB_PATH
)

# RAGEngine retrieves relevant chunks from the store and queries the LLM.
rag_engine = RAGEngine(vector_store)
|
| 23 |
+
|
| 24 |
+
def upload_and_process_pdfs(files: List[tempfile._TemporaryFileWrapper]) -> str:
    """Process uploaded PDF files and add them to the vector store.

    Args:
        files: File objects handed over by the Gradio ``gr.File`` component
            (may be None entries or non-PDF files).

    Returns:
        A human-readable status message. Non-PDF uploads are now reported
        as skipped instead of being silently dropped.
    """
    if not files:
        return "❌ No files uploaded."

    try:
        uploaded_files = []
        skipped_files = []
        total_chunks = 0

        for file in files:
            if file is None:
                continue

            file_path = file.name
            filename = os.path.basename(file_path)

            # Only PDFs are supported; remember anything else so the user
            # learns why a file produced no chunks.
            if not filename.lower().endswith('.pdf'):
                skipped_files.append(filename)
                continue

            # Extract text chunks from the PDF and index them.
            chunks = pdf_processor.extract_text_from_pdf(file_path)
            vector_store.add_documents(chunks)

            uploaded_files.append(filename)
            total_chunks += len(chunks)

        if uploaded_files:
            stats = vector_store.get_stats()
            message = (
                f"✅ Successfully processed {len(uploaded_files)} PDF(s):\n"
                f"📄 Files: {', '.join(uploaded_files)}\n"
                f"📊 Total chunks created: {total_chunks}\n"
                f"🗃️ Database now contains {stats['total_documents']} total documents"
            )
            if skipped_files:
                message += f"\n⚠️ Skipped non-PDF file(s): {', '.join(skipped_files)}"
            return message
        else:
            return "❌ No valid PDF files found."

    except Exception as e:
        return f"❌ Error processing files: {str(e)}"
|
| 64 |
+
|
| 65 |
+
def get_database_stats() -> str:
    """Return a markdown-formatted summary of the vector database contents."""
    stats = vector_store.get_stats()
    parts = [
        "📊 **Database Statistics**\n",
        f"📄 Total Documents: {stats['total_documents']}",
        f"🔍 Index Size: {stats['index_size']}",
        # Dimension is absent until the first document is indexed.
        f"📏 Vector Dimension: {stats.get('dimension', 'N/A')}",
    ]
    return "\n".join(parts)
|
| 72 |
+
|
| 73 |
+
def clear_database() -> str:
    """Delete the FAISS index and metadata, emptying the knowledge base."""
    try:
        vector_store.clear_index()
    except Exception as e:
        return f"❌ Error clearing database: {str(e)}"
    return "✅ Database cleared successfully!"
|
| 80 |
+
|
| 81 |
+
def respond(message: str, chat_history: List[dict]) -> Tuple[str, List[dict]]:
    """Handle one chat turn using Gradio's "messages" history format.

    Returns the cleared input-box value plus the updated history; any
    exception is rendered as an assistant error message instead of raising.
    """
    # Ignore blank/whitespace-only input without touching the history.
    if not message.strip():
        return "", chat_history

    try:
        # Ask the RAG engine for an answer grounded in the indexed PDFs.
        result = rag_engine.generate_answer(message, top_k=Config.TOP_K)

        response = result['answer']
        sources = result.get('sources', [])

        # Append up to three source citations beneath the answer.
        if sources:
            response += "\n\n**📚 Sources:**\n"
            for i, source in enumerate(sources[:3], 1):
                response += f"{i}. 📄 **{source['source_file']}** (Page {source['page_number']})\n"
                response += f" 📝 _{source['content_preview']}_\n"
    except Exception as e:
        response = f"❌ Error: {str(e)}"

    # Record the exchange (answer or error) in the shared history.
    chat_history.append({"role": "user", "content": message})
    chat_history.append({"role": "assistant", "content": response})

    return "", chat_history
|
| 116 |
+
|
| 117 |
+
def create_interface():
    """Create the Gradio interface.

    Builds a three-tab Blocks app: document upload/indexing, a RAG chat
    assistant, and a static system-information page. Returns the Blocks
    object so the caller decides when/how to launch it.
    """

    with gr.Blocks(title="PDF RAG System") as interface:

        # Header
        gr.Markdown("# 🤖 PDF RAG Assistant")
        gr.Markdown("Upload PDFs and ask intelligent questions about their content using AI")

        with gr.Tabs():

            # Tab 1: Document Management
            with gr.Tab("📁 Document Management"):

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("## 📤 Upload PDF Documents")
                        gr.Markdown("Drag and drop your PDF files or click to browse")

                        # Multiple-file picker restricted to PDFs.
                        file_upload = gr.File(
                            file_count="multiple",
                            file_types=[".pdf"],
                            label="Select PDF files to upload"
                        )

                        upload_btn = gr.Button(
                            "🚀 Process PDFs",
                            variant="primary",
                            size="lg"
                        )

                        # Read-only status area populated by upload_and_process_pdfs.
                        upload_status = gr.Textbox(
                            label="📊 Upload Status",
                            interactive=False,
                            max_lines=8
                        )

                    with gr.Column(scale=1):
                        gr.Markdown("## 🗄️ Database Management")

                        # Seeded with current stats at build time; refreshed by handlers below.
                        stats_display = gr.Markdown(get_database_stats())

                        with gr.Row():
                            refresh_btn = gr.Button("🔄 Refresh", size="sm", variant="secondary")
                            clear_btn = gr.Button("🗑️ Clear Database", size="sm", variant="stop")

                        clear_status = gr.Textbox(
                            label="🔧 Database Status",
                            interactive=False,
                            max_lines=3
                        )

                # Event handlers for document management
                def update_stats_display():
                    # Small wrapper so .then() chains can refresh the stats panel.
                    return get_database_stats()

                # Process uploads, then refresh the stats panel once done.
                upload_btn.click(
                    fn=upload_and_process_pdfs,
                    inputs=[file_upload],
                    outputs=[upload_status]
                ).then(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

                refresh_btn.click(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

                # Clear the index, then refresh the stats panel to show zero docs.
                clear_btn.click(
                    fn=clear_database,
                    outputs=[clear_status]
                ).then(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

            # Tab 2: Chat Interface
            with gr.Tab("💬 AI Assistant"):

                gr.Markdown("## 🤖 Ask questions about your uploaded documents")
                gr.Markdown("**💡 Tips:** Upload PDFs first, then ask specific questions about their content for detailed answers with source references.")

                # Create chat interface with messages format
                # type="messages" expects a list of {"role", "content"} dicts,
                # matching what respond() appends.
                chatbot = gr.Chatbot(
                    height=500,
                    show_label=False,
                    type="messages",
                    value=[{
                        "role": "assistant",
                        "content": "👋 **Welcome to PDF RAG Assistant!**\n\nI'm here to help you analyze and understand your PDF documents. \n\n📋 **Getting started:**\n1. Upload PDFs in the 'Document Management' tab\n2. Come back here and ask me questions\n3. I'll provide detailed answers with source references\n\n🚀 **Ready to get started?**"
                    }]
                )

                with gr.Row():
                    msg_input = gr.Textbox(
                        placeholder="💭 Ask a question about your documents...",
                        label="Your Question",
                        lines=2,
                        scale=4
                    )
                    send_btn = gr.Button(
                        "📨 Send",
                        variant="primary",
                        size="lg",
                        scale=1
                    )

                clear_chat_btn = gr.Button(
                    "🧹 Clear Chat",
                    variant="secondary",
                    size="sm"
                )

                # Event handlers for chat
                # Both the Send button and pressing Enter submit the question.
                send_btn.click(
                    fn=respond,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot]
                )

                msg_input.submit(
                    fn=respond,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot]
                )

                # Reset the history to a single fresh greeting message.
                clear_chat_btn.click(
                    fn=lambda: [{
                        "role": "assistant",
                        "content": "👋 **Welcome back!**\n\nI'm ready to help you with your PDF documents again. What would you like to know?"
                    }],
                    outputs=[chatbot]
                )

            # Tab 3: System Information
            with gr.Tab("ℹ️ System Information"):

                gr.Markdown("# ⚙️ System Configuration & Information")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## 🔧 Current Settings")

                        # Static snapshot of Config values at interface-build time.
                        settings_info = f"""
**🧠 Embedding Model:** `{Config.EMBEDDING_MODEL}`

**📝 Chunk Size:** {Config.CHUNK_SIZE} characters

**🔗 Chunk Overlap:** {Config.CHUNK_OVERLAP} characters

**🎯 Search Results:** Top {Config.TOP_K} most relevant chunks

**📁 Max File Size:** 16MB per PDF
"""
                        gr.Markdown(settings_info)

                    with gr.Column():
                        gr.Markdown("## 🚀 Key Features")

                        features_info = """
✅ Multiple PDF upload and processing

✅ Intelligent text chunking

✅ Vector similarity search using FAISS

✅ AI-powered Q&A with Google Gemini

✅ Source attribution with page numbers

✅ Persistent vector database storage

✅ Real-time chat interface

✅ Responsive modern UI
"""
                        gr.Markdown(features_info)

                gr.Markdown("## 🛠️ Technology Stack")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("**🖥️ Framework:** Gradio 4.44+")
                        gr.Markdown("**📄 PDF Processing:** PyMuPDF")
                    with gr.Column():
                        gr.Markdown("**🧮 Embeddings:** Sentence Transformers")
                        gr.Markdown("**🗃️ Vector Database:** FAISS")
                    with gr.Column():
                        gr.Markdown("**🤖 Language Model:** Google Gemini 1.5")

                gr.Markdown("## 📝 Quick Start Guide")

                guide_info = """
**1.** Upload Documents - Go to 'Document Management' tab and upload your PDF files

**2.** Process & Index - Wait for the system to extract text and create embeddings

**3.** Ask Questions - Switch to 'AI Assistant' tab and start asking questions

**4.** Get Intelligent Answers - Receive detailed responses with source references and page numbers
"""
                gr.Markdown(guide_info)

    return interface
|
| 323 |
+
|
| 324 |
+
if __name__ == "__main__":
    # Create and launch the interface
    interface = create_interface()
    interface.launch(
        server_name="0.0.0.0",  # listen on all interfaces (required inside Docker)
        server_port=7860,       # must match the port EXPOSEd by the Dockerfile
        share=False,            # no public Gradio tunnel
        show_error=True         # surface tracebacks in the UI for easier debugging
    )
|
config.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
load_dotenv()
|
| 5 |
+
|
| 6 |
+
class Config:
    """Central application settings, resolved once at import time."""

    # Google Gemini API key; required by RAGEngine for answer generation.
    # (Dead commented-out `os.getenv` duplicate removed — it was equivalent.)
    GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

    # NOTE(review): the fallback below is a well-known placeholder, not a safe
    # secret — always set SECRET_KEY in the environment for real deployments.
    SECRET_KEY = os.environ.get('SECRET_KEY', 'your-secret-key-here')
    UPLOAD_FOLDER = 'uploads'
    VECTOR_DB_PATH = 'vector_db'
    MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16 MB upload cap

    EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
    CHUNK_SIZE = 1000       # characters per text chunk
    CHUNK_OVERLAP = 200     # characters shared between consecutive chunks

    TOP_K = 5               # retrieved chunks per query

    ALLOWED_EXTENSIONS = {'pdf'}

# Ensure working directories exist before any upload/indexing happens.
os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
os.makedirs(Config.VECTOR_DB_PATH, exist_ok=True)
|
pdf_processor.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
|
| 7 |
+
@dataclass
class DocumentChunk:
    """One chunk of cleaned text extracted from a PDF page, with provenance."""
    content: str               # the cleaned chunk text
    metadata: Dict[str, Any]   # filename, page_number, chunk_length (see PDFProcessor)
    page_number: int           # 1-based page number the chunk came from
    source_file: str           # basename of the originating PDF
|
| 13 |
+
|
| 14 |
+
class PDFProcessor:
    """Extracts text from PDFs and splits it into overlapping, sentence-aware chunks."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Args:
            chunk_size: target maximum chunk length in characters.
            chunk_overlap: characters carried over between consecutive chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_path: str) -> List["DocumentChunk"]:
        """Extract text from PDF and return chunks with metadata.

        Raises:
            Exception: wrapping any PyMuPDF failure, with the offending path
                included and the original exception chained as the cause.
        """
        chunks = []

        try:
            doc = fitz.open(pdf_path)
            try:
                filename = os.path.basename(pdf_path)

                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    text = page.get_text()

                    # Skip blank pages so we never emit empty chunks.
                    if text.strip():
                        cleaned_text = self._clean_text(text)
                        # Pages are reported 1-based to match PDF viewers.
                        page_chunks = self._create_chunks(cleaned_text, page_num + 1, filename)
                        chunks.extend(page_chunks)
            finally:
                # Close even if a page fails mid-way; the original leaked the
                # document handle on any per-page exception.
                doc.close()

            return chunks

        except Exception as e:
            raise Exception(f"Error processing PDF {pdf_path}: {str(e)}") from e

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text: collapse whitespace, drop odd symbols,
        and squash runs of repeated punctuation."""
        text = re.sub(r'\s+', ' ', text)
        # Keep word chars, whitespace and common punctuation/brackets only.
        text = re.sub(r'[^\w\s.,!?;:()\[\]{}"-]', '', text)
        # "!!!" -> "!", "..," -> ","
        text = re.sub(r'([.,!?;:]){2,}', r'\1', text)

        return text.strip()

    def _create_chunks(self, text: str, page_number: int, filename: str) -> List["DocumentChunk"]:
        """Split text into overlapping chunks, preferring sentence boundaries."""
        chunks = []
        # Split after sentence-ending punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        current_chunk = ""
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            # Flush the current chunk once adding this sentence would exceed
            # the budget (never flush an empty chunk).
            if current_length + sentence_length > self.chunk_size and current_chunk:
                chunks.append(DocumentChunk(
                    content=current_chunk.strip(),
                    metadata={
                        'filename': filename,
                        'page_number': page_number,
                        'chunk_length': len(current_chunk)
                    },
                    page_number=page_number,
                    source_file=filename
                ))

                # Seed the next chunk with the tail of this one for context.
                overlap_text = self._get_overlap_text(current_chunk)
                current_chunk = overlap_text + " " + sentence
                current_length = len(current_chunk)
            else:
                current_chunk += " " + sentence if current_chunk else sentence
                current_length = len(current_chunk)

        # Emit whatever is left over as the final chunk.
        if current_chunk.strip():
            chunks.append(DocumentChunk(
                content=current_chunk.strip(),
                metadata={
                    'filename': filename,
                    'page_number': page_number,
                    'chunk_length': len(current_chunk)
                },
                page_number=page_number,
                source_file=filename
            ))

        return chunks

    def _get_overlap_text(self, text: str) -> str:
        """Return the last ``chunk_overlap`` characters (or all of a short text)."""
        if len(text) <= self.chunk_overlap:
            return text
        return text[-self.chunk_overlap:]
|
rag_engine.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import google.generativeai as genai
|
| 2 |
+
from typing import List, Dict, Any
|
| 3 |
+
from vector_store import VectorStore
|
| 4 |
+
from config import Config
|
| 5 |
+
|
| 6 |
+
class RAGEngine:
    """Retrieval-augmented generation over a VectorStore using Google Gemini."""

    def __init__(self, vector_store: "VectorStore", model_name: str = 'gemini-2.0-flash-lite'):
        """
        Args:
            vector_store: store used to retrieve relevant document chunks.
            model_name: Gemini model id; parameterized (with the previous
                hard-coded value as default) so deployments can switch models
                without editing this class.
        """
        self.vector_store = vector_store

        genai.configure(api_key=Config.GEMINI_API_KEY)  # type: ignore
        self.model = genai.GenerativeModel(model_name)  # type: ignore

    def generate_answer(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """Generate answer using RAG pipeline.

        Returns a dict with 'answer', 'sources', 'context_used' and (on
        success) 'query', or (on failure) 'error'. Never raises: errors are
        reported through the returned dict so the UI can display them.
        """
        try:
            search_results = self.vector_store.search(query, top_k)

            if not search_results:
                return {
                    'answer': "I couldn't find any relevant information in the uploaded documents to answer your question.",
                    'sources': [],
                    'context_used': ""
                }

            context_parts = []
            sources = []

            for i, result in enumerate(search_results):
                context_parts.append(f"[Context {i+1}]: {result['content']}")
                sources.append({
                    'source_file': result['source_file'],
                    'page_number': result['page_number'],
                    'similarity_score': result['similarity_score'],
                    # Truncated preview for display in the chat UI.
                    'content_preview': result['content'][:200] + "..." if len(result['content']) > 200 else result['content']
                })

            context = "\n\n".join(context_parts)
            prompt = self._create_prompt(query, context)
            response = self.model.generate_content(prompt)

            return {
                'answer': response.text,
                'sources': sources,
                'context_used': context,
                'query': query
            }

        except Exception as e:
            return {
                'answer': f"An error occurred while generating the answer: {str(e)}",
                'sources': [],
                'context_used': "",
                'error': str(e)
            }

    def _create_prompt(self, query: str, context: str) -> str:
        """Create a grounded-QA prompt embedding the retrieved context."""
        prompt = f"""You are an AI assistant that answers questions based on provided document context.

Instructions:
1. Answer the question using ONLY the information provided in the context below
2. If the context doesn't contain enough information to answer the question, say so clearly
3. Be concise but comprehensive in your answer
4. If you reference specific information, mention which context section it comes from
5. Do not make up information that's not in the provided context

Context from documents:
{context}

Question: {query}

Answer:"""

        return prompt

    def get_conversation_response(self, query: str) -> str:
        """Get a simple text response (answer plus up to three source lines)."""
        result = self.generate_answer(query)

        answer = result['answer']
        sources = result.get('sources', [])

        if sources:
            answer += "\n\n**Sources:**\n"
            for i, source in enumerate(sources[:3], 1):
                answer += f"{i}. {source['source_file']} (Page {source['page_number']})\n"

        return answer
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
PyMuPDF
|
| 3 |
+
sentence-transformers
|
| 4 |
+
faiss-cpu
|
| 5 |
+
google-generativeai
|
| 6 |
+
python-dotenv
|
| 7 |
+
numpy
|
vector_db/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
vector_store.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pickle
|
| 4 |
+
import os
|
| 5 |
+
from typing import List, Dict, Any, Tuple
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
from pdf_processor import DocumentChunk
|
| 8 |
+
|
| 9 |
+
class VectorStore:
    """FAISS-backed store of embedded document chunks, persisted to disk."""

    def __init__(self, model_name: str, vector_db_path: str):
        """
        Args:
            model_name: sentence-transformers model id for embeddings.
            vector_db_path: directory holding the index and metadata files.
        """
        self.model = SentenceTransformer(model_name)
        self.vector_db_path = vector_db_path
        self.index_path = os.path.join(vector_db_path, 'faiss_index.bin')
        self.metadata_path = os.path.join(vector_db_path, 'metadata.pkl')

        self.index = None   # faiss.IndexFlatIP, created lazily on first add
        self.metadata = []  # one dict per stored chunk, aligned with index rows
        self.load_index()

    def load_index(self):
        """Load existing FAISS index and metadata; fall back to empty on any error."""
        try:
            if os.path.exists(self.index_path) and os.path.exists(self.metadata_path):
                self.index = faiss.read_index(self.index_path)
                # NOTE(review): pickle is only safe here because the file is
                # written locally by save_index(); never load untrusted pickles.
                with open(self.metadata_path, 'rb') as f:
                    self.metadata = pickle.load(f)

                print(f"Loaded existing index with {len(self.metadata)} documents")
            else:
                print("No existing index found. Will create new one.")
        except Exception as e:
            print(f"Error loading index: {e}")
            self.index = None
            self.metadata = []

    def add_documents(self, chunks: List["DocumentChunk"]):
        """Embed chunks, add them to the index, and persist everything."""
        if not chunks:
            return

        texts = [chunk.content for chunk in chunks]
        embeddings = self.model.encode(texts, convert_to_tensor=False)
        embeddings = np.array(embeddings).astype('float32')

        if self.index is None:
            dimension = embeddings.shape[1]
            # Inner-product index; with L2-normalized vectors this is cosine similarity.
            self.index = faiss.IndexFlatIP(dimension)

        # Normalize EVERY batch (not just the first) so inner product equals
        # cosine similarity, matching the normalized query in search().
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)  # type: ignore

        for chunk in chunks:
            self.metadata.append({
                'content': chunk.content,
                'metadata': chunk.metadata,
                'page_number': chunk.page_number,
                'source_file': chunk.source_file
            })

        self.save_index()
        print(f"Added {len(chunks)} chunks to vector store")

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` stored chunks most similar to ``query``.

        Each result is a metadata dict plus a 'similarity_score' float;
        an empty store yields an empty list.
        """
        if self.index is None or len(self.metadata) == 0:
            return []

        query_embedding = self.model.encode([query], convert_to_tensor=False)
        query_embedding = np.array(query_embedding).astype('float32')
        faiss.normalize_L2(query_embedding)
        # Cap top_k at the store size; FAISS pads missing hits with -1.
        scores, indices = self.index.search(query_embedding, min(top_k, len(self.metadata)))  # type: ignore
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx != -1:
                result = self.metadata[idx].copy()
                result['similarity_score'] = float(score)
                results.append(result)

        return results

    def save_index(self):
        """Persist FAISS index and metadata to disk (best-effort; errors are logged)."""
        try:
            if self.index is not None:
                faiss.write_index(self.index, self.index_path)

            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.metadata, f)

        except Exception as e:
            print(f"Error saving index: {e}")

    def get_stats(self) -> Dict[str, Any]:
        """Return document count, index size and (if built) vector dimension."""
        if self.index is None:
            return {'total_documents': 0, 'index_size': 0}

        return {
            'total_documents': len(self.metadata),
            'index_size': self.index.ntotal,
            'dimension': self.index.d
        }

    def clear_index(self):
        """Drop the in-memory index/metadata and delete the on-disk files."""
        self.index = None
        self.metadata = []
        if os.path.exists(self.index_path):
            os.remove(self.index_path)
        if os.path.exists(self.metadata_path):
            os.remove(self.metadata_path)

        print("Index cleared successfully")
|