dnj0 committed on
Commit
16691ee
·
verified ·
1 Parent(s): 8099442

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +327 -195
src/app.py CHANGED
@@ -1,10 +1,11 @@
1
  import streamlit as st
2
  import os
 
3
  from pathlib import Path
 
4
  from pdf_parser import PDFParser
5
  from embedder import ChromaDBManager
6
  from rag_pipeline import RAGPipeline
7
- import torch
8
 
9
 
10
  # ============================================================================
@@ -27,16 +28,25 @@ st.markdown("""
27
  .main {
28
  padding: 2rem;
29
  }
30
- .error-box {
31
- background-color: #ffcccc;
32
- border: 1px solid #ff0000;
 
 
 
 
 
 
 
 
 
33
  border-radius: 4px;
34
  padding: 10px;
35
  margin: 10px 0;
36
  }
37
- .warning-box {
38
- background-color: #ffffcc;
39
- border: 1px solid #ffcc00;
40
  border-radius: 4px;
41
  padding: 10px;
42
  margin: 10px 0;
@@ -61,6 +71,7 @@ def initialize_system():
61
  st.error(f"Error initializing system: {e}")
62
  return None, None, None, None
63
 
 
64
  # Initialize
65
  pdf_parser, chroma_manager, rag_pipeline, device = initialize_system()
66
 
@@ -68,214 +79,233 @@ if pdf_parser is None:
68
  st.error("Failed to initialize RAG system. Please check your installation.")
69
  st.stop()
70
 
 
 
 
 
 
 
 
71
  # ============================================================================
72
  # MAIN UI
73
  # ============================================================================
74
 
75
- st.title("📄 Multimodal PDF RAG System (Improved)")
76
  st.markdown("**Local AI-powered document analysis with Qwen2.5-VL and ChromaDB**")
77
- st.markdown("*Fixes: Better error handling, token management, robust processing*")
78
 
79
- # Sidebar
80
- with st.sidebar:
81
- st.header("⚙️ Configuration")
82
-
83
- # PDF directory
84
- pdf_dir = st.text_input(
85
- "PDF Directory Path",
86
- value="./pdf_documents",
87
- help="Directory containing PDF files to process"
88
- )
89
-
90
- # Create directory if it doesn't exist
91
- os.makedirs(pdf_dir, exist_ok=True)
92
-
93
- st.divider()
94
-
95
- # Load/Refresh documents
96
- col1, col2 = st.columns(2)
97
- with col1:
98
- if st.button("πŸ“ Load PDFs", use_container_width=True):
99
- with st.spinner("Processing PDFs..."):
100
- try:
101
- documents = pdf_parser.process_pdf_directory(pdf_dir)
102
-
103
- if documents:
104
- chroma_manager.add_documents(documents)
105
- st.success(f"✅ Loaded {len(documents)} documents!")
106
- else:
107
- st.warning("⚠️ No PDFs found in directory")
108
- except Exception as e:
109
- st.error(f"❌ Error loading PDFs: {e}")
110
-
111
- with col2:
112
- if st.button("🔄 Refresh", use_container_width=True):
113
- st.rerun()
114
-
115
- st.divider()
116
-
117
- # Statistics
118
- st.subheader("📊 Statistics")
119
- try:
120
- collection_info = chroma_manager.get_collection_info()
121
- st.metric("Documents in DB", collection_info['document_count'])
122
- except Exception as e:
123
- st.warning(f"Could not load statistics: {e}")
124
-
125
- st.divider()
126
-
127
- # Device info
128
- device_name = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
129
- st.info(f"Running on: {device_name}")
130
-
131
- # Main content with tabs
132
- tab1, tab2, tab3, tab4 = st.tabs(["πŸ” Ask Question", "πŸ“ Document Summary", "ℹ️ About", "πŸ› οΈ Database"])
133
 
134
  # ============================================================================
135
- # TAB 1: ASK QUESTIONS
136
  # ============================================================================
137
 
138
- with tab1:
139
- st.header("πŸ” Ask Questions About Your Documents")
140
 
141
  col1, col2 = st.columns([3, 1])
142
 
143
  with col1:
144
- query = st.text_input(
145
- "Enter your question (in Russian or English):",
146
- placeholder="Например: Какие ключевые моменты описаны в документе?",
147
- help="Ask any question about your uploaded documents"
 
 
 
 
148
  )
149
 
150
  with col2:
151
- n_docs = st.number_input("Retrieved docs:", value=5, min_value=1, max_value=10)
152
 
153
- if st.button("🚀 Get Answer", use_container_width=True, type="primary"):
154
- try:
155
- collection_info = chroma_manager.get_collection_info()
156
-
157
- if collection_info['document_count'] == 0:
158
- st.warning("⚠️ No documents loaded. Please load PDFs from the sidebar first.")
159
- elif not query:
160
- st.warning("⚠️ Please enter a question.")
161
- else:
162
- with st.spinner("🤖 Generating answer... (this may take 10-60 seconds)"):
163
- result = rag_pipeline.answer_question(
164
- query=query,
165
- n_retrieved=n_docs,
166
- max_new_tokens=512
167
- )
168
-
169
- # Check for errors
170
- if "error" in result and result["error"]:
171
- st.error(f"⚠️ {result['error']}")
172
-
173
- # Display answer
174
- st.success("✅ Answer Generated")
175
- st.markdown("### Answer")
176
- st.write(result['answer'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- # Display retrieved documents
179
- with st.expander("📚 Retrieved Documents", expanded=False):
180
- st.markdown(f"#### {result['doc_count']} Relevant Document Chunks:")
181
- for idx, doc in enumerate(result['retrieved_docs'], 1):
182
- with st.container():
183
- col_rel, col_meta = st.columns([3, 1])
184
- with col_rel:
185
- st.markdown(f"**Document {idx}**")
186
- with col_meta:
187
- st.caption(f"Relevance: {doc['relevance_score']:.2%}")
188
-
189
- # Truncate for display
190
- preview = doc['document'][:300] + "..." if len(doc['document']) > 300 else doc['document']
191
- st.write(preview)
192
- if doc['metadata']:
193
- st.caption(f"Source: {doc['metadata'].get('filename', 'Unknown')}")
194
 
195
- except Exception as e:
196
- st.error(f"❌ Error processing question: {e}")
197
-
198
- # ============================================================================
199
- # TAB 2: DOCUMENT SUMMARY
200
- # ============================================================================
201
-
202
- with tab2:
203
- st.header("πŸ“ Document Summary")
204
- st.markdown("Generate a summary of all indexed documents")
205
-
206
- if st.button("📊 Generate Summary of All Documents", use_container_width=True, type="primary"):
207
- try:
208
  collection_info = chroma_manager.get_collection_info()
209
-
210
- if collection_info['document_count'] == 0:
211
- st.warning("⚠️ No documents loaded. Please load PDFs first.")
212
- else:
213
- with st.spinner("🤖 Generating summary... (this may take 20-60 seconds)"):
214
- summary = rag_pipeline.summarize_all_documents()
215
- st.markdown("### Summary")
216
- st.write(summary)
217
- except Exception as e:
218
- st.error(f"❌ Error generating summary: {e}")
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  # ============================================================================
221
- # TAB 3: ABOUT
222
  # ============================================================================
223
 
224
- with tab3:
225
- st.header("ℹ️ About This System")
226
-
227
- st.markdown("""
228
- ### Overview
229
- This is an **improved Local Multimodal RAG System** with enhanced error handling and token management.
230
-
231
- ### Key Improvements (Fixed Version)
232
- βœ… **Token Management**: Automatic context truncation to prevent model errors
233
- βœ… **Error Handling**: Comprehensive try-catch blocks throughout
234
- βœ… **Image Extraction**: Fixed PyMuPDF xref handling
235
- βœ… **Better Limits**: Resource limits on text, tables, and images
236
- βœ… **Performance**: Optimized for large PDFs (400+ pages)
237
- βœ… **Robustness**: Graceful degradation on errors
238
-
239
- ### Core Features
240
- - **πŸ“„ PDF Processing**: Text, tables, and images extraction
241
- - **πŸ” Vector Search**: ChromaDB with CLIP embeddings
242
- - **πŸ€– AI Generation**: Qwen2.5-VL-3B model
243
- - **🌐 Russian Support**: Full support for Russian language
244
- - **πŸ’Ύ Persistent Storage**: Local ChromaDB database
245
- - **⚑ Lightweight**: Runs on consumer hardware
246
-
247
- ### Technology Stack
248
- - **LLM Model**: Qwen2.5-VL-3B-Instruct
249
- - **Embeddings**: CLIP (clip-vit-base-patch32)
250
- - **Vector DB**: ChromaDB with persistent storage
251
- - **UI**: Streamlit
252
- - **PDF Tools**: pdfplumber + PyMuPDF
253
-
254
- ### System Requirements
255
- - Python 3.9+
256
- - RAM: 8GB minimum (12GB+ recommended)
257
- - Storage: 15GB for models
258
- - GPU optional (CUDA for faster inference)
259
 
260
- ### Performance
261
- - Model Load: ~30 seconds
262
- - Query Response (CPU): 20-60 seconds
263
- - Query Response (GPU): 5-15 seconds
264
- - PDF Processing: 1-2 seconds per page
265
 
266
- ### What's Fixed
267
- - βœ… Token limit errors (uses chunking + truncation)
268
- - βœ… Image extraction errors (proper xref handling)
269
- - βœ… Memory issues (resource limits on text/tables/images)
270
- - βœ… PyTorch GPU loading (fbgemm.dll issues)
271
- - βœ… Error reporting (detailed error messages)
272
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
  # ============================================================================
275
- # TAB 4: DATABASE MANAGEMENT
276
  # ============================================================================
277
 
278
- with tab4:
279
  st.header("🛠️ Database Management")
280
 
281
  col1, col2, col3 = st.columns(3)
@@ -294,10 +324,17 @@ with tab4:
294
  all_docs = chroma_manager.collection.get(include=['documents'])
295
  if all_docs['ids']:
296
  st.write(f"Total documents: {len(all_docs['ids'])}")
297
- for idx, doc_id in enumerate(all_docs['ids'][:15], 1):
298
- st.write(f"{idx}. {doc_id}")
299
- if len(all_docs['ids']) > 15:
300
- st.write(f"... and {len(all_docs['ids']) - 15} more")
 
 
 
 
 
 
 
301
  else:
302
  st.info("No documents in database")
303
  except Exception as e:
@@ -318,14 +355,109 @@ with tab4:
318
 
319
  st.divider()
320
 
321
- st.markdown("### Quick Stats")
322
- stats_col1, stats_col2 = st.columns(2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
324
- with stats_col1:
325
- st.metric("PDF Extraction Dir", "./pdf_extractions")
 
 
 
326
 
327
- with stats_col2:
328
- st.metric("ChromaDB Location", "./chroma_db")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
 
330
  # ============================================================================
331
  # FOOTER
@@ -334,6 +466,6 @@ with tab4:
334
  st.divider()
335
  st.markdown("""
336
  <div style='text-align: center; color: #666; font-size: 0.9rem;'>
337
- Multimodal RAG System (Improved) | Qwen2.5-VL + ChromaDB + Streamlit | v1.1
338
  </div>
339
  """, unsafe_allow_html=True)
 
1
  import streamlit as st
2
  import os
3
+ import tempfile
4
  from pathlib import Path
5
+ import torch
6
  from pdf_parser import PDFParser
7
  from embedder import ChromaDBManager
8
  from rag_pipeline import RAGPipeline
 
9
 
10
 
11
  # ============================================================================
 
28
  .main {
29
  padding: 2rem;
30
  }
31
+ .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
32
+ font-size: 1.2rem;
33
+ }
34
+ .upload-area {
35
+ border: 2px dashed #ccc;
36
+ border-radius: 5px;
37
+ padding: 20px;
38
+ text-align: center;
39
+ }
40
+ .success-box {
41
+ background-color: #d4edda;
42
+ border: 1px solid #28a745;
43
  border-radius: 4px;
44
  padding: 10px;
45
  margin: 10px 0;
46
  }
47
+ .error-box {
48
+ background-color: #f8d7da;
49
+ border: 1px solid #f5c6cb;
50
  border-radius: 4px;
51
  padding: 10px;
52
  margin: 10px 0;
 
71
  st.error(f"Error initializing system: {e}")
72
  return None, None, None, None
73
 
74
+
75
  # Initialize
76
  pdf_parser, chroma_manager, rag_pipeline, device = initialize_system()
77
 
 
79
  st.error("Failed to initialize RAG system. Please check your installation.")
80
  st.stop()
81
 
82
+ # Initialize session state for uploaded files
83
+ if 'uploaded_files' not in st.session_state:
84
+ st.session_state.uploaded_files = []
85
+
86
+ if 'processing_status' not in st.session_state:
87
+ st.session_state.processing_status = {}
88
+
89
  # ============================================================================
90
  # MAIN UI
91
  # ============================================================================
92
 
93
+ st.title("📄 Multimodal PDF RAG System")
94
  st.markdown("**Local AI-powered document analysis with Qwen2.5-VL and ChromaDB**")
95
+ st.markdown("*Upload PDFs directly and ask questions about them*")
96
 
97
+ # Create main tabs
98
+ tab_upload, tab_query, tab_manage, tab_about = st.tabs(["📤 Upload PDFs", "🔍 Ask Questions", "🛠️ Manage", "ℹ️ About"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  # ============================================================================
101
+ # TAB 1: UPLOAD PDFs
102
  # ============================================================================
103
 
104
+ with tab_upload:
105
+ st.header("📤 Upload PDF Documents")
106
 
107
  col1, col2 = st.columns([3, 1])
108
 
109
  with col1:
110
+ st.markdown("**Upload your PDF files below. They will be automatically processed and stored.**")
111
+
112
+ # File uploader
113
+ uploaded_files = st.file_uploader(
114
+ "Choose PDF files",
115
+ type=["pdf"],
116
+ accept_multiple_files=True,
117
+ help="You can upload multiple PDF files at once"
118
  )
119
 
120
  with col2:
121
+ st.info(f"📊 Documents in DB: {chroma_manager.get_collection_info()['document_count']}")
122
 
123
+ # Process uploaded files
124
+ if uploaded_files:
125
+ st.divider()
126
+ st.subheader("Processing Uploaded Files")
127
+
128
+ # Create a temporary directory for uploads
129
+ temp_dir = tempfile.mkdtemp()
130
+
131
+ progress_bar = st.progress(0)
132
+ status_text = st.empty()
133
+ results_container = st.container()
134
+
135
+ total_files = len(uploaded_files)
136
+ processed_files = []
137
+ failed_files = []
138
+
139
+ for idx, uploaded_file in enumerate(uploaded_files):
140
+ try:
141
+ # Update progress
142
+ status_text.text(f"Processing {idx + 1}/{total_files}: {uploaded_file.name}")
143
+
144
+ # Save uploaded file to temp directory
145
+ temp_file_path = os.path.join(temp_dir, uploaded_file.name)
146
+ with open(temp_file_path, "wb") as f:
147
+ f.write(uploaded_file.getbuffer())
148
+
149
+ # Process PDF
150
+ with st.spinner(f"Extracting content from {uploaded_file.name}..."):
151
+ try:
152
+ result = pdf_parser.process_pdf(temp_file_path)
153
+
154
+ # Add to ChromaDB
155
+ chroma_manager.add_documents([result])
156
+
157
+ processed_files.append({
158
+ 'name': uploaded_file.name,
159
+ 'size': uploaded_file.size,
160
+ 'text_length': len(result.get('text', '')),
161
+ 'tables': len(result.get('tables', [])),
162
+ 'images': len(result.get('images', []))
163
+ })
164
+
165
+ st.success(f"✅ {uploaded_file.name} processed successfully")
166
 
167
+ except Exception as e:
168
+ failed_files.append({
169
+ 'name': uploaded_file.name,
170
+ 'error': str(e)
171
+ })
172
+ st.error(f"❌ Error processing {uploaded_file.name}: {e}")
173
+
174
+ # Update progress
175
+ progress_bar.progress((idx + 1) / total_files)
176
+
177
+ except Exception as e:
178
+ failed_files.append({
179
+ 'name': uploaded_file.name,
180
+ 'error': str(e)
181
+ })
182
+ st.error(f"❌ Error with {uploaded_file.name}: {e}")
183
 
184
+ # Show summary
185
+ st.divider()
186
+ st.subheader("Upload Summary")
187
+
188
+ col1, col2, col3 = st.columns(3)
189
+
190
+ with col1:
191
+ st.metric("Successfully Processed", len(processed_files))
192
+
193
+ with col2:
194
+ st.metric("Failed", len(failed_files))
195
+
196
+ with col3:
197
  collection_info = chroma_manager.get_collection_info()
198
+ st.metric("Total in Database", collection_info['document_count'])
199
+
200
+ # Show details of processed files
201
+ if processed_files:
202
+ st.markdown("#### ✅ Processed Files:")
203
+ for file_info in processed_files:
204
+ col1, col2, col3, col4 = st.columns(4)
205
+ with col1:
206
+ st.text(file_info['name'])
207
+ with col2:
208
+ st.text(f"{file_info['size'] / 1024:.1f} KB")
209
+ with col3:
210
+ st.text(f"{file_info['text_length']:,} chars")
211
+ with col4:
212
+ st.text(f"{file_info['tables']} tables, {file_info['images']} imgs")
213
+
214
+ # Show failed files
215
+ if failed_files:
216
+ st.markdown("#### ❌ Failed Files:")
217
+ for file_info in failed_files:
218
+ st.error(f"**{file_info['name']}**: {file_info['error']}")
219
 
220
  # ============================================================================
221
+ # TAB 2: ASK QUESTIONS
222
  # ============================================================================
223
 
224
+ with tab_query:
225
+ st.header("πŸ” Ask Questions About Your Documents")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
+ collection_info = chroma_manager.get_collection_info()
 
 
 
 
228
 
229
+ if collection_info['document_count'] == 0:
230
+ st.warning("⚠️ No documents uploaded yet. Please upload PDFs in the 'Upload PDFs' tab first.")
231
+ else:
232
+ st.success(f"✅ {collection_info['document_count']} documents in database")
233
+
234
+ col1, col2, col3 = st.columns([2, 1, 1])
235
+
236
+ with col1:
237
+ query = st.text_input(
238
+ "Enter your question:",
239
+ placeholder="Например: Какие ключевые моменты описаны в документе?",
240
+ help="Ask any question about your uploaded documents"
241
+ )
242
+
243
+ with col2:
244
+ n_docs = st.number_input("Retrieved docs:", value=3, min_value=1, max_value=10)
245
+
246
+ with col3:
247
+ max_tokens = st.number_input("Max tokens:", value=256, min_value=128, max_value=512, step=128)
248
+
249
+ if st.button("🚀 Get Answer", use_container_width=True, type="primary"):
250
+ if not query:
251
+ st.warning("⚠️ Please enter a question.")
252
+ else:
253
+ try:
254
+ with st.spinner("🤖 Generating answer... (this may take 10-30 seconds)"):
255
+ st.info("Processing query - please wait...")
256
+
257
+ # Generate answer with error handling
258
+ try:
259
+ result = rag_pipeline.answer_question(
260
+ query=query,
261
+ n_retrieved=n_docs,
262
+ max_new_tokens=max_tokens
263
+ )
264
+
265
+ # Check for errors
266
+ if "error" in result and result["error"]:
267
+ st.error(f"⚠️ {result['error']}")
268
+
269
+ # Display answer
270
+ st.success("✅ Answer Generated")
271
+ st.markdown("### Answer")
272
+ st.write(result['answer'])
273
+
274
+ # Display retrieved documents
275
+ with st.expander("📚 Retrieved Documents", expanded=False):
276
+ st.markdown(f"#### {result['doc_count']} Relevant Document Chunks:")
277
+ for idx, doc in enumerate(result['retrieved_docs'], 1):
278
+ with st.container():
279
+ col_rel, col_score = st.columns([3, 1])
280
+ with col_rel:
281
+ st.markdown(f"**Document {idx}**")
282
+ with col_score:
283
+ st.caption(f"Score: {doc['relevance_score']:.1%}")
284
+
285
+ # Truncate for display
286
+ preview = doc['document'][:400]
287
+ if len(doc['document']) > 400:
288
+ preview += "..."
289
+ st.write(preview)
290
+
291
+ if doc['metadata']:
292
+ st.caption(f"Source: {doc['metadata'].get('filename', 'Unknown')}")
293
+
294
+ except Exception as e:
295
+ st.error(f"❌ Error during generation: {e}")
296
+ st.info("Possible causes:")
297
+ st.write("- Out of memory (try reducing 'Max tokens' or 'Retrieved docs')")
298
+ st.write("- Model inference timeout")
299
+ st.write("- Invalid input format")
300
+
301
+ except Exception as e:
302
+ st.error(f"❌ Unexpected error: {e}")
303
 
304
  # ============================================================================
305
+ # TAB 3: MANAGE DATABASE
306
  # ============================================================================
307
 
308
+ with tab_manage:
309
  st.header("🛠️ Database Management")
310
 
311
  col1, col2, col3 = st.columns(3)
 
324
  all_docs = chroma_manager.collection.get(include=['documents'])
325
  if all_docs['ids']:
326
  st.write(f"Total documents: {len(all_docs['ids'])}")
327
+
328
+ col1_list, col2_list = st.columns(2)
329
+
330
+ with col1_list:
331
+ st.write("**First 10:**")
332
+ for idx, doc_id in enumerate(all_docs['ids'][:10], 1):
333
+ st.write(f"{idx}. {doc_id[:50]}...")
334
+
335
+ with col2_list:
336
+ if len(all_docs['ids']) > 10:
337
+ st.write(f"**... and {len(all_docs['ids']) - 10} more**")
338
  else:
339
  st.info("No documents in database")
340
  except Exception as e:
 
355
 
356
  st.divider()
357
 
358
+ st.markdown("### Storage Information")
359
+ col1, col2 = st.columns(2)
360
+
361
+ with col1:
362
+ extraction_size = sum(
363
+ os.path.getsize(os.path.join(dirpath, filename))
364
+ for dirpath, dirnames, filenames in os.walk("./pdf_extractions")
365
+ for filename in filenames
366
+ ) / (1024 * 1024) if os.path.exists("./pdf_extractions") else 0
367
+ st.metric("PDF Extractions", f"{extraction_size:.1f} MB")
368
+
369
+ with col2:
370
+ chroma_size = sum(
371
+ os.path.getsize(os.path.join(dirpath, filename))
372
+ for dirpath, dirnames, filenames in os.walk("./chroma_db")
373
+ for filename in filenames
374
+ ) / (1024 * 1024) if os.path.exists("./chroma_db") else 0
375
+ st.metric("ChromaDB Storage", f"{chroma_size:.1f} MB")
376
+
377
+ # ============================================================================
378
+ # TAB 4: ABOUT
379
+ # ============================================================================
380
+
381
+ with tab_about:
382
+ st.header("ℹ️ About This System")
383
+
384
+ st.markdown("""
385
+ ### Multimodal RAG System with PDF Upload
386
+
387
+ This is a **local, privacy-first AI document analysis system** that allows you to:
388
+
389
+ #### ✨ Features
390
+ - **πŸ“€ Easy PDF Upload**: Drag & drop or select multiple PDF files
391
+ - **πŸ” Smart Search**: Semantic search across documents with CLIP embeddings
392
+ - **πŸ€– AI-Powered Answers**: Ask questions and get answers from Qwen2.5-VL-3B
393
+ - **🌐 Russian & English**: Full support for both languages
394
+ - **πŸ’Ύ Local Storage**: All data stays on your machine
395
+ - **⚑ Fast Processing**: Automatic caching to avoid re-processing
396
+
397
+ #### πŸ—οΈ How It Works
398
+ 1. Upload PDF documents
399
+ 2. System extracts text, tables, and images
400
+ 3. Content is embedded with CLIP and stored in ChromaDB
401
+ 4. Ask questions about your documents
402
+ 5. AI retrieves relevant sections and generates answers
403
+
404
+ #### πŸ” Privacy & Security
405
+ - ✅ All processing happens locally
406
+ - ✅ No internet required (after model download)
407
+ - ✅ No cloud APIs used
408
+ - ✅ Full data control
409
+ - ✅ Open-source code
410
+
411
+ #### 💻 Technology Stack
412
+ - **LLM**: Qwen2.5-VL-3B (multimodal)
413
+ - **Embeddings**: CLIP (clip-vit-base-patch32)
414
+ - **Vector DB**: ChromaDB
415
+ - **UI**: Streamlit
416
+ - **PDF Processing**: pdfplumber + PyMuPDF
417
+
418
+ #### 📊 System Info
419
+ """)
420
+
421
+ col1, col2, col3 = st.columns(3)
422
+
423
+ with col1:
424
+ device_name = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
425
+ st.metric("Device", device_name)
426
+
427
+ with col2:
428
+ collection_info = chroma_manager.get_collection_info()
429
+ st.metric("Documents in DB", collection_info['document_count'])
430
+
431
+ with col3:
432
+ st.metric("Version", "1.2 (Upload)")
433
+
434
+ st.divider()
435
+
436
+ st.markdown("""
437
+ #### πŸ“ How to Use
438
 
439
+ 1. **Upload PDFs**: Go to the "Upload PDFs" tab and select your files
440
+ 2. **Wait for Processing**: System automatically extracts content
441
+ 3. **Ask Questions**: Switch to "Ask Questions" tab and type your query
442
+ 4. **Review Results**: See generated answers and relevant document chunks
443
+ 5. **Manage**: Use "Manage" tab to view or clear database
444
 
445
+ #### ⚙️ Tips for Best Results
446
+ - Start with smaller PDFs to test
447
+ - Ask specific questions for better answers
448
+ - Reduce "Retrieved docs" if responses are slow
449
+ - Use Russian for Russian documents (better accuracy)
450
+
451
+ #### 🔧 Performance Tuning
452
+ - **Slow responses**: Reduce "Max tokens" from 512 to 256
453
+ - **Out of memory**: Use fewer "Retrieved docs" (1-3)
454
+ - **Better quality**: Increase "Max tokens" to 512
455
+
456
+ #### ❓ Troubleshooting
457
+ - **App closes**: Reduce "Max tokens" and "Retrieved docs"
458
+ - **Slow processing**: First upload takes time (model loading)
459
+ - **Memory issues**: Use CPU mode (edit in sidebar)
460
+ """)
461
 
462
  # ============================================================================
463
  # FOOTER
 
466
  st.divider()
467
  st.markdown("""
468
  <div style='text-align: center; color: #666; font-size: 0.9rem;'>
469
+ Multimodal RAG System with PDF Upload | Qwen2.5-VL + ChromaDB + Streamlit | v1.2
470
  </div>
471
  """, unsafe_allow_html=True)