Update src/app.py
Browse files- src/app.py +327 -195
src/app.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import os
|
|
|
|
| 3 |
from pathlib import Path
|
|
|
|
| 4 |
from pdf_parser import PDFParser
|
| 5 |
from embedder import ChromaDBManager
|
| 6 |
from rag_pipeline import RAGPipeline
|
| 7 |
-
import torch
|
| 8 |
|
| 9 |
|
| 10 |
# ============================================================================
|
|
@@ -27,16 +28,25 @@ st.markdown("""
|
|
| 27 |
.main {
|
| 28 |
padding: 2rem;
|
| 29 |
}
|
| 30 |
-
.
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
border-radius: 4px;
|
| 34 |
padding: 10px;
|
| 35 |
margin: 10px 0;
|
| 36 |
}
|
| 37 |
-
.
|
| 38 |
-
background-color: #
|
| 39 |
-
border: 1px solid #
|
| 40 |
border-radius: 4px;
|
| 41 |
padding: 10px;
|
| 42 |
margin: 10px 0;
|
|
@@ -61,6 +71,7 @@ def initialize_system():
|
|
| 61 |
st.error(f"Error initializing system: {e}")
|
| 62 |
return None, None, None, None
|
| 63 |
|
|
|
|
| 64 |
# Initialize
|
| 65 |
pdf_parser, chroma_manager, rag_pipeline, device = initialize_system()
|
| 66 |
|
|
@@ -68,214 +79,233 @@ if pdf_parser is None:
|
|
| 68 |
st.error("Failed to initialize RAG system. Please check your installation.")
|
| 69 |
st.stop()
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# ============================================================================
|
| 72 |
# MAIN UI
|
| 73 |
# ============================================================================
|
| 74 |
|
| 75 |
-
st.title("π Multimodal PDF RAG System
|
| 76 |
st.markdown("**Local AI-powered document analysis with Qwen2.5-VL and ChromaDB**")
|
| 77 |
-
st.markdown("*
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
|
| 81 |
-
st.header("βοΈ Configuration")
|
| 82 |
-
|
| 83 |
-
# PDF directory
|
| 84 |
-
pdf_dir = st.text_input(
|
| 85 |
-
"PDF Directory Path",
|
| 86 |
-
value="./pdf_documents",
|
| 87 |
-
help="Directory containing PDF files to process"
|
| 88 |
-
)
|
| 89 |
-
|
| 90 |
-
# Create directory if it doesn't exist
|
| 91 |
-
os.makedirs(pdf_dir, exist_ok=True)
|
| 92 |
-
|
| 93 |
-
st.divider()
|
| 94 |
-
|
| 95 |
-
# Load/Refresh documents
|
| 96 |
-
col1, col2 = st.columns(2)
|
| 97 |
-
with col1:
|
| 98 |
-
if st.button("π Load PDFs", use_container_width=True):
|
| 99 |
-
with st.spinner("Processing PDFs..."):
|
| 100 |
-
try:
|
| 101 |
-
documents = pdf_parser.process_pdf_directory(pdf_dir)
|
| 102 |
-
|
| 103 |
-
if documents:
|
| 104 |
-
chroma_manager.add_documents(documents)
|
| 105 |
-
st.success(f"β
Loaded {len(documents)} documents!")
|
| 106 |
-
else:
|
| 107 |
-
st.warning("β οΈ No PDFs found in directory")
|
| 108 |
-
except Exception as e:
|
| 109 |
-
st.error(f"β Error loading PDFs: {e}")
|
| 110 |
-
|
| 111 |
-
with col2:
|
| 112 |
-
if st.button("π Refresh", use_container_width=True):
|
| 113 |
-
st.rerun()
|
| 114 |
-
|
| 115 |
-
st.divider()
|
| 116 |
-
|
| 117 |
-
# Statistics
|
| 118 |
-
st.subheader("π Statistics")
|
| 119 |
-
try:
|
| 120 |
-
collection_info = chroma_manager.get_collection_info()
|
| 121 |
-
st.metric("Documents in DB", collection_info['document_count'])
|
| 122 |
-
except Exception as e:
|
| 123 |
-
st.warning(f"Could not load statistics: {e}")
|
| 124 |
-
|
| 125 |
-
st.divider()
|
| 126 |
-
|
| 127 |
-
# Device info
|
| 128 |
-
device_name = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
|
| 129 |
-
st.info(f"Running on: {device_name}")
|
| 130 |
-
|
| 131 |
-
# Main content with tabs
|
| 132 |
-
tab1, tab2, tab3, tab4 = st.tabs(["π Ask Question", "π Document Summary", "βΉοΈ About", "π οΈ Database"])
|
| 133 |
|
| 134 |
# ============================================================================
|
| 135 |
-
# TAB 1:
|
| 136 |
# ============================================================================
|
| 137 |
|
| 138 |
-
with
|
| 139 |
-
st.header("
|
| 140 |
|
| 141 |
col1, col2 = st.columns([3, 1])
|
| 142 |
|
| 143 |
with col1:
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
)
|
| 149 |
|
| 150 |
with col2:
|
| 151 |
-
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
collection_info = chroma_manager.get_collection_info()
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
# ============================================================================
|
| 221 |
-
# TAB
|
| 222 |
# ============================================================================
|
| 223 |
|
| 224 |
-
with
|
| 225 |
-
st.header("
|
| 226 |
-
|
| 227 |
-
st.markdown("""
|
| 228 |
-
### Overview
|
| 229 |
-
This is an **improved Local Multimodal RAG System** with enhanced error handling and token management.
|
| 230 |
-
|
| 231 |
-
### Key Improvements (Fixed Version)
|
| 232 |
-
β
**Token Management**: Automatic context truncation to prevent model errors
|
| 233 |
-
β
**Error Handling**: Comprehensive try-catch blocks throughout
|
| 234 |
-
β
**Image Extraction**: Fixed PyMuPDF xref handling
|
| 235 |
-
β
**Better Limits**: Resource limits on text, tables, and images
|
| 236 |
-
β
**Performance**: Optimized for large PDFs (400+ pages)
|
| 237 |
-
β
**Robustness**: Graceful degradation on errors
|
| 238 |
-
|
| 239 |
-
### Core Features
|
| 240 |
-
- **π PDF Processing**: Text, tables, and images extraction
|
| 241 |
-
- **π Vector Search**: ChromaDB with CLIP embeddings
|
| 242 |
-
- **π€ AI Generation**: Qwen2.5-VL-3B model
|
| 243 |
-
- **π Russian Support**: Full support for Russian language
|
| 244 |
-
- **πΎ Persistent Storage**: Local ChromaDB database
|
| 245 |
-
- **β‘ Lightweight**: Runs on consumer hardware
|
| 246 |
-
|
| 247 |
-
### Technology Stack
|
| 248 |
-
- **LLM Model**: Qwen2.5-VL-3B-Instruct
|
| 249 |
-
- **Embeddings**: CLIP (clip-vit-base-patch32)
|
| 250 |
-
- **Vector DB**: ChromaDB with persistent storage
|
| 251 |
-
- **UI**: Streamlit
|
| 252 |
-
- **PDF Tools**: pdfplumber + PyMuPDF
|
| 253 |
-
|
| 254 |
-
### System Requirements
|
| 255 |
-
- Python 3.9+
|
| 256 |
-
- RAM: 8GB minimum (12GB+ recommended)
|
| 257 |
-
- Storage: 15GB for models
|
| 258 |
-
- GPU optional (CUDA for faster inference)
|
| 259 |
|
| 260 |
-
|
| 261 |
-
- Model Load: ~30 seconds
|
| 262 |
-
- Query Response (CPU): 20-60 seconds
|
| 263 |
-
- Query Response (GPU): 5-15 seconds
|
| 264 |
-
- PDF Processing: 1-2 seconds per page
|
| 265 |
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
# ============================================================================
|
| 275 |
-
# TAB
|
| 276 |
# ============================================================================
|
| 277 |
|
| 278 |
-
with
|
| 279 |
st.header("π οΈ Database Management")
|
| 280 |
|
| 281 |
col1, col2, col3 = st.columns(3)
|
|
@@ -294,10 +324,17 @@ with tab4:
|
|
| 294 |
all_docs = chroma_manager.collection.get(include=['documents'])
|
| 295 |
if all_docs['ids']:
|
| 296 |
st.write(f"Total documents: {len(all_docs['ids'])}")
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
else:
|
| 302 |
st.info("No documents in database")
|
| 303 |
except Exception as e:
|
|
@@ -318,14 +355,109 @@ with tab4:
|
|
| 318 |
|
| 319 |
st.divider()
|
| 320 |
|
| 321 |
-
st.markdown("###
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
-
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
-
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
# ============================================================================
|
| 331 |
# FOOTER
|
|
@@ -334,6 +466,6 @@ with tab4:
|
|
| 334 |
st.divider()
|
| 335 |
st.markdown("""
|
| 336 |
<div style='text-align: center; color: #666; font-size: 0.9rem;'>
|
| 337 |
-
Multimodal RAG System
|
| 338 |
</div>
|
| 339 |
""", unsafe_allow_html=True)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import os
|
| 3 |
+
import tempfile
|
| 4 |
from pathlib import Path
|
| 5 |
+
import torch
|
| 6 |
from pdf_parser import PDFParser
|
| 7 |
from embedder import ChromaDBManager
|
| 8 |
from rag_pipeline import RAGPipeline
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
# ============================================================================
|
|
|
|
| 28 |
.main {
|
| 29 |
padding: 2rem;
|
| 30 |
}
|
| 31 |
+
.stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
|
| 32 |
+
font-size: 1.2rem;
|
| 33 |
+
}
|
| 34 |
+
.upload-area {
|
| 35 |
+
border: 2px dashed #ccc;
|
| 36 |
+
border-radius: 5px;
|
| 37 |
+
padding: 20px;
|
| 38 |
+
text-align: center;
|
| 39 |
+
}
|
| 40 |
+
.success-box {
|
| 41 |
+
background-color: #d4edda;
|
| 42 |
+
border: 1px solid #28a745;
|
| 43 |
border-radius: 4px;
|
| 44 |
padding: 10px;
|
| 45 |
margin: 10px 0;
|
| 46 |
}
|
| 47 |
+
.error-box {
|
| 48 |
+
background-color: #f8d7da;
|
| 49 |
+
border: 1px solid #f5c6cb;
|
| 50 |
border-radius: 4px;
|
| 51 |
padding: 10px;
|
| 52 |
margin: 10px 0;
|
|
|
|
| 71 |
st.error(f"Error initializing system: {e}")
|
| 72 |
return None, None, None, None
|
| 73 |
|
| 74 |
+
|
| 75 |
# Initialize
|
| 76 |
pdf_parser, chroma_manager, rag_pipeline, device = initialize_system()
|
| 77 |
|
|
|
|
| 79 |
st.error("Failed to initialize RAG system. Please check your installation.")
|
| 80 |
st.stop()
|
| 81 |
|
| 82 |
+
# Initialize session state for uploaded files
|
| 83 |
+
if 'uploaded_files' not in st.session_state:
|
| 84 |
+
st.session_state.uploaded_files = []
|
| 85 |
+
|
| 86 |
+
if 'processing_status' not in st.session_state:
|
| 87 |
+
st.session_state.processing_status = {}
|
| 88 |
+
|
| 89 |
# ============================================================================
|
| 90 |
# MAIN UI
|
| 91 |
# ============================================================================
|
| 92 |
|
| 93 |
+
st.title("π Multimodal PDF RAG System")
|
| 94 |
st.markdown("**Local AI-powered document analysis with Qwen2.5-VL and ChromaDB**")
|
| 95 |
+
st.markdown("*Upload PDFs directly and ask questions about them*")
|
| 96 |
|
| 97 |
+
# Create main tabs
|
| 98 |
+
tab_upload, tab_query, tab_manage, tab_about = st.tabs(["π€ Upload PDFs", "π Ask Questions", "π οΈ Manage", "βΉοΈ About"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# ============================================================================
|
| 101 |
+
# TAB 1: UPLOAD PDFs
|
| 102 |
# ============================================================================
|
| 103 |
|
| 104 |
+
with tab_upload:
|
| 105 |
+
st.header("π€ Upload PDF Documents")
|
| 106 |
|
| 107 |
col1, col2 = st.columns([3, 1])
|
| 108 |
|
| 109 |
with col1:
|
| 110 |
+
st.markdown("**Upload your PDF files below. They will be automatically processed and stored.**")
|
| 111 |
+
|
| 112 |
+
# File uploader
|
| 113 |
+
uploaded_files = st.file_uploader(
|
| 114 |
+
"Choose PDF files",
|
| 115 |
+
type=["pdf"],
|
| 116 |
+
accept_multiple_files=True,
|
| 117 |
+
help="You can upload multiple PDF files at once"
|
| 118 |
)
|
| 119 |
|
| 120 |
with col2:
|
| 121 |
+
st.info(f"π Documents in DB: {chroma_manager.get_collection_info()['document_count']}")
|
| 122 |
|
| 123 |
+
# Process uploaded files
|
| 124 |
+
if uploaded_files:
|
| 125 |
+
st.divider()
|
| 126 |
+
st.subheader("Processing Uploaded Files")
|
| 127 |
+
|
| 128 |
+
# Create a temporary directory for uploads
|
| 129 |
+
temp_dir = tempfile.mkdtemp()
|
| 130 |
+
|
| 131 |
+
progress_bar = st.progress(0)
|
| 132 |
+
status_text = st.empty()
|
| 133 |
+
results_container = st.container()
|
| 134 |
+
|
| 135 |
+
total_files = len(uploaded_files)
|
| 136 |
+
processed_files = []
|
| 137 |
+
failed_files = []
|
| 138 |
+
|
| 139 |
+
for idx, uploaded_file in enumerate(uploaded_files):
|
| 140 |
+
try:
|
| 141 |
+
# Update progress
|
| 142 |
+
status_text.text(f"Processing {idx + 1}/{total_files}: {uploaded_file.name}")
|
| 143 |
+
|
| 144 |
+
# Save uploaded file to temp directory
|
| 145 |
+
temp_file_path = os.path.join(temp_dir, uploaded_file.name)
|
| 146 |
+
with open(temp_file_path, "wb") as f:
|
| 147 |
+
f.write(uploaded_file.getbuffer())
|
| 148 |
+
|
| 149 |
+
# Process PDF
|
| 150 |
+
with st.spinner(f"Extracting content from {uploaded_file.name}..."):
|
| 151 |
+
try:
|
| 152 |
+
result = pdf_parser.process_pdf(temp_file_path)
|
| 153 |
+
|
| 154 |
+
# Add to ChromaDB
|
| 155 |
+
chroma_manager.add_documents([result])
|
| 156 |
+
|
| 157 |
+
processed_files.append({
|
| 158 |
+
'name': uploaded_file.name,
|
| 159 |
+
'size': uploaded_file.size,
|
| 160 |
+
'text_length': len(result.get('text', '')),
|
| 161 |
+
'tables': len(result.get('tables', [])),
|
| 162 |
+
'images': len(result.get('images', []))
|
| 163 |
+
})
|
| 164 |
+
|
| 165 |
+
st.success(f"β
{uploaded_file.name} processed successfully")
|
| 166 |
|
| 167 |
+
except Exception as e:
|
| 168 |
+
failed_files.append({
|
| 169 |
+
'name': uploaded_file.name,
|
| 170 |
+
'error': str(e)
|
| 171 |
+
})
|
| 172 |
+
st.error(f"❌ Error processing {uploaded_file.name}: {e}")
|
| 173 |
+
|
| 174 |
+
# Update progress
|
| 175 |
+
progress_bar.progress((idx + 1) / total_files)
|
| 176 |
+
|
| 177 |
+
except Exception as e:
|
| 178 |
+
failed_files.append({
|
| 179 |
+
'name': uploaded_file.name,
|
| 180 |
+
'error': str(e)
|
| 181 |
+
})
|
| 182 |
+
st.error(f"❌ Error with {uploaded_file.name}: {e}")
|
| 183 |
|
| 184 |
+
# Show summary
|
| 185 |
+
st.divider()
|
| 186 |
+
st.subheader("Upload Summary")
|
| 187 |
+
|
| 188 |
+
col1, col2, col3 = st.columns(3)
|
| 189 |
+
|
| 190 |
+
with col1:
|
| 191 |
+
st.metric("Successfully Processed", len(processed_files))
|
| 192 |
+
|
| 193 |
+
with col2:
|
| 194 |
+
st.metric("Failed", len(failed_files))
|
| 195 |
+
|
| 196 |
+
with col3:
|
| 197 |
collection_info = chroma_manager.get_collection_info()
|
| 198 |
+
st.metric("Total in Database", collection_info['document_count'])
|
| 199 |
+
|
| 200 |
+
# Show details of processed files
|
| 201 |
+
if processed_files:
|
| 202 |
+
st.markdown("#### β
Processed Files:")
|
| 203 |
+
for file_info in processed_files:
|
| 204 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 205 |
+
with col1:
|
| 206 |
+
st.text(file_info['name'])
|
| 207 |
+
with col2:
|
| 208 |
+
st.text(f"{file_info['size'] / 1024:.1f} KB")
|
| 209 |
+
with col3:
|
| 210 |
+
st.text(f"{file_info['text_length']:,} chars")
|
| 211 |
+
with col4:
|
| 212 |
+
st.text(f"{file_info['tables']} tables, {file_info['images']} imgs")
|
| 213 |
+
|
| 214 |
+
# Show failed files
|
| 215 |
+
if failed_files:
|
| 216 |
+
st.markdown("#### ❌ Failed Files:")
|
| 217 |
+
for file_info in failed_files:
|
| 218 |
+
st.error(f"**{file_info['name']}**: {file_info['error']}")
|
| 219 |
|
| 220 |
# ============================================================================
|
| 221 |
+
# TAB 2: ASK QUESTIONS
|
| 222 |
# ============================================================================
|
| 223 |
|
| 224 |
+
with tab_query:
|
| 225 |
+
st.header("π Ask Questions About Your Documents")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
+
collection_info = chroma_manager.get_collection_info()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
+
if collection_info['document_count'] == 0:
|
| 230 |
+
st.warning("⚠️ No documents uploaded yet. Please upload PDFs in the 'Upload PDFs' tab first.")
|
| 231 |
+
else:
|
| 232 |
+
st.success(f"β
{collection_info['document_count']} documents in database")
|
| 233 |
+
|
| 234 |
+
col1, col2, col3 = st.columns([2, 1, 1])
|
| 235 |
+
|
| 236 |
+
with col1:
|
| 237 |
+
query = st.text_input(
|
| 238 |
+
"Enter your question:",
|
| 239 |
+
placeholder="Например: Какие ключевые моменты описаны в документе?",
|
| 240 |
+
help="Ask any question about your uploaded documents"
|
| 241 |
+
)
|
| 242 |
+
|
| 243 |
+
with col2:
|
| 244 |
+
n_docs = st.number_input("Retrieved docs:", value=3, min_value=1, max_value=10)
|
| 245 |
+
|
| 246 |
+
with col3:
|
| 247 |
+
max_tokens = st.number_input("Max tokens:", value=256, min_value=128, max_value=512, step=128)
|
| 248 |
+
|
| 249 |
+
if st.button("π Get Answer", use_container_width=True, type="primary"):
|
| 250 |
+
if not query:
|
| 251 |
+
st.warning("⚠️ Please enter a question.")
|
| 252 |
+
else:
|
| 253 |
+
try:
|
| 254 |
+
with st.spinner("π€ Generating answer... (this may take 10-30 seconds)"):
|
| 255 |
+
st.info("Processing query - please wait...")
|
| 256 |
+
|
| 257 |
+
# Generate answer with error handling
|
| 258 |
+
try:
|
| 259 |
+
result = rag_pipeline.answer_question(
|
| 260 |
+
query=query,
|
| 261 |
+
n_retrieved=n_docs,
|
| 262 |
+
max_new_tokens=max_tokens
|
| 263 |
+
)
|
| 264 |
+
|
| 265 |
+
# Check for errors
|
| 266 |
+
if "error" in result and result["error"]:
|
| 267 |
+
st.error(f"⚠️ {result['error']}")
|
| 268 |
+
|
| 269 |
+
# Display answer
|
| 270 |
+
st.success("β
Answer Generated")
|
| 271 |
+
st.markdown("### Answer")
|
| 272 |
+
st.write(result['answer'])
|
| 273 |
+
|
| 274 |
+
# Display retrieved documents
|
| 275 |
+
with st.expander("π Retrieved Documents", expanded=False):
|
| 276 |
+
st.markdown(f"#### {result['doc_count']} Relevant Document Chunks:")
|
| 277 |
+
for idx, doc in enumerate(result['retrieved_docs'], 1):
|
| 278 |
+
with st.container():
|
| 279 |
+
col_rel, col_score = st.columns([3, 1])
|
| 280 |
+
with col_rel:
|
| 281 |
+
st.markdown(f"**Document {idx}**")
|
| 282 |
+
with col_score:
|
| 283 |
+
st.caption(f"Score: {doc['relevance_score']:.1%}")
|
| 284 |
+
|
| 285 |
+
# Truncate for display
|
| 286 |
+
preview = doc['document'][:400]
|
| 287 |
+
if len(doc['document']) > 400:
|
| 288 |
+
preview += "..."
|
| 289 |
+
st.write(preview)
|
| 290 |
+
|
| 291 |
+
if doc['metadata']:
|
| 292 |
+
st.caption(f"Source: {doc['metadata'].get('filename', 'Unknown')}")
|
| 293 |
+
|
| 294 |
+
except Exception as e:
|
| 295 |
+
st.error(f"❌ Error during generation: {e}")
|
| 296 |
+
st.info("Possible causes:")
|
| 297 |
+
st.write("- Out of memory (try reducing 'Max tokens' or 'Retrieved docs')")
|
| 298 |
+
st.write("- Model inference timeout")
|
| 299 |
+
st.write("- Invalid input format")
|
| 300 |
+
|
| 301 |
+
except Exception as e:
|
| 302 |
+
st.error(f"❌ Unexpected error: {e}")
|
| 303 |
|
| 304 |
# ============================================================================
|
| 305 |
+
# TAB 3: MANAGE DATABASE
|
| 306 |
# ============================================================================
|
| 307 |
|
| 308 |
+
with tab_manage:
|
| 309 |
st.header("π οΈ Database Management")
|
| 310 |
|
| 311 |
col1, col2, col3 = st.columns(3)
|
|
|
|
| 324 |
all_docs = chroma_manager.collection.get(include=['documents'])
|
| 325 |
if all_docs['ids']:
|
| 326 |
st.write(f"Total documents: {len(all_docs['ids'])}")
|
| 327 |
+
|
| 328 |
+
col1_list, col2_list = st.columns(2)
|
| 329 |
+
|
| 330 |
+
with col1_list:
|
| 331 |
+
st.write("**First 10:**")
|
| 332 |
+
for idx, doc_id in enumerate(all_docs['ids'][:10], 1):
|
| 333 |
+
st.write(f"{idx}. {doc_id[:50]}...")
|
| 334 |
+
|
| 335 |
+
with col2_list:
|
| 336 |
+
if len(all_docs['ids']) > 10:
|
| 337 |
+
st.write(f"**... and {len(all_docs['ids']) - 10} more**")
|
| 338 |
else:
|
| 339 |
st.info("No documents in database")
|
| 340 |
except Exception as e:
|
|
|
|
| 355 |
|
| 356 |
st.divider()
|
| 357 |
|
| 358 |
+
st.markdown("### Storage Information")
|
| 359 |
+
col1, col2 = st.columns(2)
|
| 360 |
+
|
| 361 |
+
with col1:
|
| 362 |
+
extraction_size = sum(
|
| 363 |
+
os.path.getsize(os.path.join(dirpath, filename))
|
| 364 |
+
for dirpath, dirnames, filenames in os.walk("./pdf_extractions")
|
| 365 |
+
for filename in filenames
|
| 366 |
+
) / (1024 * 1024) if os.path.exists("./pdf_extractions") else 0
|
| 367 |
+
st.metric("PDF Extractions", f"{extraction_size:.1f} MB")
|
| 368 |
+
|
| 369 |
+
with col2:
|
| 370 |
+
chroma_size = sum(
|
| 371 |
+
os.path.getsize(os.path.join(dirpath, filename))
|
| 372 |
+
for dirpath, dirnames, filenames in os.walk("./chroma_db")
|
| 373 |
+
for filename in filenames
|
| 374 |
+
) / (1024 * 1024) if os.path.exists("./chroma_db") else 0
|
| 375 |
+
st.metric("ChromaDB Storage", f"{chroma_size:.1f} MB")
|
| 376 |
+
|
| 377 |
+
# ============================================================================
|
| 378 |
+
# TAB 4: ABOUT
|
| 379 |
+
# ============================================================================
|
| 380 |
+
|
| 381 |
+
with tab_about:
|
| 382 |
+
st.header("ℹ️ About This System")
|
| 383 |
+
|
| 384 |
+
st.markdown("""
|
| 385 |
+
### Multimodal RAG System with PDF Upload
|
| 386 |
+
|
| 387 |
+
This is a **local, privacy-first AI document analysis system** that allows you to:
|
| 388 |
+
|
| 389 |
+
#### ✨ Features
|
| 390 |
+
- **π€ Easy PDF Upload**: Drag & drop or select multiple PDF files
|
| 391 |
+
- **π Smart Search**: Semantic search across documents with CLIP embeddings
|
| 392 |
+
- **π€ AI-Powered Answers**: Ask questions and get answers from Qwen2.5-VL-3B
|
| 393 |
+
- **π Russian & English**: Full support for both languages
|
| 394 |
+
- **πΎ Local Storage**: All data stays on your machine
|
| 395 |
+
- **⚡ Fast Processing**: Automatic caching to avoid re-processing
|
| 396 |
+
|
| 397 |
+
#### ποΈ How It Works
|
| 398 |
+
1. Upload PDF documents
|
| 399 |
+
2. System extracts text, tables, and images
|
| 400 |
+
3. Content is embedded with CLIP and stored in ChromaDB
|
| 401 |
+
4. Ask questions about your documents
|
| 402 |
+
5. AI retrieves relevant sections and generates answers
|
| 403 |
+
|
| 404 |
+
#### π Privacy & Security
|
| 405 |
+
- ✅ All processing happens locally
|
| 406 |
+
- ✅ No internet required (after model download)
|
| 407 |
+
- ✅ No cloud APIs used
|
| 408 |
+
- ✅ Full data control
|
| 409 |
+
- ✅ Open-source code
|
| 410 |
+
|
| 411 |
+
#### π» Technology Stack
|
| 412 |
+
- **LLM**: Qwen2.5-VL-3B (multimodal)
|
| 413 |
+
- **Embeddings**: CLIP (clip-vit-base-patch32)
|
| 414 |
+
- **Vector DB**: ChromaDB
|
| 415 |
+
- **UI**: Streamlit
|
| 416 |
+
- **PDF Processing**: pdfplumber + PyMuPDF
|
| 417 |
+
|
| 418 |
+
#### π System Info
|
| 419 |
+
""")
|
| 420 |
+
|
| 421 |
+
col1, col2, col3 = st.columns(3)
|
| 422 |
+
|
| 423 |
+
with col1:
|
| 424 |
+
device_name = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
|
| 425 |
+
st.metric("Device", device_name)
|
| 426 |
+
|
| 427 |
+
with col2:
|
| 428 |
+
collection_info = chroma_manager.get_collection_info()
|
| 429 |
+
st.metric("Documents in DB", collection_info['document_count'])
|
| 430 |
+
|
| 431 |
+
with col3:
|
| 432 |
+
st.metric("Version", "1.2 (Upload)")
|
| 433 |
+
|
| 434 |
+
st.divider()
|
| 435 |
+
|
| 436 |
+
st.markdown("""
|
| 437 |
+
#### π How to Use
|
| 438 |
|
| 439 |
+
1. **Upload PDFs**: Go to the "Upload PDFs" tab and select your files
|
| 440 |
+
2. **Wait for Processing**: System automatically extracts content
|
| 441 |
+
3. **Ask Questions**: Switch to "Ask Questions" tab and type your query
|
| 442 |
+
4. **Review Results**: See generated answers and relevant document chunks
|
| 443 |
+
5. **Manage**: Use "Manage" tab to view or clear database
|
| 444 |
|
| 445 |
+
#### ⚙️ Tips for Best Results
|
| 446 |
+
- Start with smaller PDFs to test
|
| 447 |
+
- Ask specific questions for better answers
|
| 448 |
+
- Reduce "Retrieved docs" if responses are slow
|
| 449 |
+
- Use Russian for Russian documents (better accuracy)
|
| 450 |
+
|
| 451 |
+
#### π§ Performance Tuning
|
| 452 |
+
- **Slow responses**: Reduce "Max tokens" from 512 to 256
|
| 453 |
+
- **Out of memory**: Use fewer "Retrieved docs" (1-3)
|
| 454 |
+
- **Better quality**: Increase "Max tokens" to 512
|
| 455 |
+
|
| 456 |
+
#### β Troubleshooting
|
| 457 |
+
- **App closes**: Reduce "Max tokens" and "Retrieved docs"
|
| 458 |
+
- **Slow processing**: First upload takes time (model loading)
|
| 459 |
+
- **Memory issues**: Use CPU mode (edit in sidebar)
|
| 460 |
+
""")
|
| 461 |
|
| 462 |
# ============================================================================
|
| 463 |
# FOOTER
|
|
|
|
| 466 |
st.divider()
|
| 467 |
st.markdown("""
|
| 468 |
<div style='text-align: center; color: #666; font-size: 0.9rem;'>
|
| 469 |
+
Multimodal RAG System with PDF Upload | Qwen2.5-VL + ChromaDB + Streamlit | v1.2
|
| 470 |
</div>
|
| 471 |
""", unsafe_allow_html=True)
|