dnj0 committed on
Commit
21f3961
·
verified ·
1 Parent(s): 483d0df

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +306 -267
src/app.py CHANGED
@@ -1,267 +1,306 @@
1
- import streamlit as st
2
- import os
3
- from pathlib import Path
4
- from rag_pipeline import RAGPipeline
5
- import shutil
6
-
7
- # Page configuration
8
- st.set_page_config(
9
- page_title="Local Multimodal RAG",
10
- page_icon="πŸ“š",
11
- layout="wide",
12
- initial_sidebar_state="expanded"
13
- )
14
-
15
- st.title("πŸ“š Local Multimodal RAG System")
16
- st.markdown("**Analyze PDF documents locally with Mistral + CLIP embeddings**")
17
-
18
- # Initialize session state
19
- if "uploaded_files" not in st.session_state:
20
- st.session_state.uploaded_files = []
21
- if "rag_pipeline" not in st.session_state:
22
- st.session_state.rag_pipeline = None
23
- if "needs_reindex" not in st.session_state:
24
- st.session_state.needs_reindex = False
25
-
26
- # Sidebar configuration
27
- with st.sidebar:
28
- st.header("βš™οΈ Configuration")
29
-
30
- pdf_dir = st.text_input(
31
- "πŸ“ PDF Directory",
32
- value="./pdfs",
33
- help="Path to directory containing PDF files"
34
- )
35
-
36
- device = st.selectbox(
37
- "πŸ–₯️ Device",
38
- ["cpu", "cuda"],
39
- help="Device for model inference"
40
- )
41
-
42
- n_context_docs = st.slider(
43
- "πŸ“„ Context Documents",
44
- min_value=1,
45
- max_value=10,
46
- value=3,
47
- help="Number of documents to retrieve for context"
48
- )
49
-
50
- st.divider()
51
-
52
- # PDF Upload Section
53
- st.subheader("πŸ“€ Upload PDF Files")
54
-
55
- uploaded_pdfs = st.file_uploader(
56
- "Choose PDF files to upload",
57
- type="pdf",
58
- accept_multiple_files=True,
59
- help="Select one or more PDF files to add to the system"
60
- )
61
-
62
- if uploaded_pdfs:
63
- # Create PDF directory if not exists
64
- os.makedirs(pdf_dir, exist_ok=True)
65
-
66
- upload_button = st.button("⬆️ Upload PDFs", use_container_width=True)
67
-
68
- if upload_button:
69
- uploaded_count = 0
70
- for uploaded_file in uploaded_pdfs:
71
- file_path = os.path.join(pdf_dir, uploaded_file.name)
72
-
73
- # Save file
74
- with open(file_path, "wb") as f:
75
- f.write(uploaded_file.getbuffer())
76
-
77
- st.session_state.uploaded_files.append(uploaded_file.name)
78
- uploaded_count += 1
79
-
80
- st.success(f"βœ… Uploaded {uploaded_count} PDF(s) successfully!")
81
- st.session_state.needs_reindex = True
82
-
83
- st.divider()
84
-
85
- # Display uploaded files
86
- pdf_files = list(Path(pdf_dir).glob("*.pdf"))
87
- if pdf_files:
88
- st.subheader(f"πŸ“š Documents ({len(pdf_files)})")
89
- for pdf_file in pdf_files:
90
- col1, col2 = st.columns([4, 1])
91
- with col1:
92
- st.write(f"β€’ {pdf_file.name}")
93
- with col2:
94
- if st.button("πŸ—‘οΈ", key=f"delete_{pdf_file.name}", help="Delete this file"):
95
- os.remove(pdf_file)
96
- st.session_state.needs_reindex = True
97
- st.rerun()
98
-
99
- st.divider()
100
-
101
- # Reindex button
102
- if st.button("πŸ”„ Reload & Index PDFs", use_container_width=True):
103
- st.session_state.rag_pipeline = None
104
- st.session_state.needs_reindex = True
105
- st.rerun()
106
-
107
-
108
- # Initialize pipeline in session state
109
- @st.cache_resource
110
- def init_rag_pipeline(_device, _pdf_dir):
111
- """Initialize RAG pipeline (cached)"""
112
- # Create PDF directory if not exists
113
- os.makedirs(_pdf_dir, exist_ok=True)
114
-
115
- # Check if PDFs exist
116
- pdf_files = list(Path(_pdf_dir).glob("*.pdf"))
117
- if not pdf_files:
118
- return None, f"No PDF files found in {_pdf_dir}. Upload PDFs using the sidebar."
119
-
120
- try:
121
- with st.spinner("⏳ Initializing RAG pipeline..."):
122
- pipeline = RAGPipeline(pdf_dir=_pdf_dir, device=_device)
123
- with st.spinner("⏳ Indexing PDFs..."):
124
- pipeline.index_pdfs()
125
- return pipeline, None
126
- except Exception as e:
127
- return None, str(e)
128
-
129
-
130
- # Get or initialize pipeline
131
- if st.session_state.rag_pipeline is None or st.session_state.needs_reindex:
132
- pipeline, error = init_rag_pipeline(device, pdf_dir)
133
- if error:
134
- st.error(f"❌ Error: {error}")
135
- st.info("πŸ’‘ **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")
136
- st.stop()
137
- st.session_state.rag_pipeline = pipeline
138
- st.session_state.needs_reindex = False
139
- else:
140
- pipeline = st.session_state.rag_pipeline
141
-
142
- # Main content
143
- if pipeline:
144
- # Tabs
145
- tab1, tab2, tab3 = st.tabs(["❓ Q&A", "πŸ“Š Summary", "πŸ“– Retrieval"])
146
-
147
- # Tab 1: Question Answering
148
- with tab1:
149
- st.subheader("Ask Questions about Your Documents")
150
-
151
- question = st.text_area(
152
- "Your question (in Russian or English):",
153
- height=100,
154
- placeholder="What is this document about? What are the main points? Etc.",
155
- key="qa_question"
156
- )
157
-
158
- col1, col2 = st.columns(2)
159
- with col1:
160
- get_answer_btn = st.button("πŸ” Get Answer", use_container_width=True)
161
- with col2:
162
- clear_btn = st.button("πŸ—‘οΈ Clear", use_container_width=True)
163
-
164
- if clear_btn:
165
- st.rerun()
166
-
167
- if get_answer_btn:
168
- if question.strip():
169
- with st.spinner("⏳ Retrieving documents and generating answer..."):
170
- try:
171
- result = pipeline.answer_question(question, n_context_docs=n_context_docs)
172
- except Exception as e:
173
- st.error(f"Error generating answer: {str(e)}")
174
- result = None
175
-
176
- if result and result.get("answer"):
177
- st.success("βœ“ Answer generated!")
178
-
179
- # Display answer
180
- st.subheader("πŸ“ Answer")
181
- st.write(result["answer"])
182
-
183
- # Display sources
184
- with st.expander("πŸ“š Sources Used"):
185
- for i, source in enumerate(result["sources"], 1):
186
- st.write(f"{i}. {source}")
187
-
188
- # Display stats
189
- col1, col2 = st.columns(2)
190
- with col1:
191
- st.metric("Documents Used", result.get("context_used", 0))
192
- with col2:
193
- st.metric("Answer Length", len(result["answer"]))
194
- else:
195
- st.warning("Please enter a question")
196
-
197
- # Tab 2: Document Summary
198
- with tab2:
199
- st.subheader("Summary of Indexed Documents")
200
-
201
- if st.button("πŸ“Š Generate Summary", use_container_width=True):
202
- with st.spinner("⏳ Generating summary..."):
203
- try:
204
- summary = pipeline.summarize_documents()
205
- st.success("βœ“ Summary generated!")
206
- st.subheader("πŸ“„ Document Summary")
207
- st.write(summary)
208
- except Exception as e:
209
- st.error(f"Error generating summary: {str(e)}")
210
-
211
- # Tab 3: Document Retrieval
212
- with tab3:
213
- st.subheader("Search and Retrieve Documents")
214
-
215
- search_query = st.text_input(
216
- "Search query:",
217
- placeholder="Enter search terms...",
218
- key="retrieval_search"
219
- )
220
-
221
- col1, col2 = st.columns(2)
222
- with col1:
223
- search_btn = st.button("πŸ”Ž Search", use_container_width=True)
224
- with col2:
225
- clear_search_btn = st.button("Clear Search", use_container_width=True)
226
-
227
- if clear_search_btn:
228
- st.rerun()
229
-
230
- if search_btn:
231
- if search_query.strip():
232
- with st.spinner("⏳ Searching..."):
233
- try:
234
- results = pipeline.retrieve_documents(search_query, n_results=n_context_docs)
235
- except Exception as e:
236
- st.error(f"Search error: {str(e)}")
237
- results = []
238
-
239
- if results:
240
- st.success(f"βœ“ Found {len(results)} documents")
241
-
242
- for i, doc in enumerate(results, 1):
243
- with st.expander(f"πŸ“„ Document {i} - {doc['source']}", expanded=(i==1)):
244
- st.write(doc["content"])
245
- else:
246
- st.warning("No documents found matching your query")
247
- else:
248
- st.warning("Please enter a search query")
249
-
250
- # Footer
251
- st.divider()
252
- with st.expander("ℹ️ System Information"):
253
- info = pipeline.vector_store.get_collection_info()
254
- col1, col2, col3, col4 = st.columns(4)
255
- with col1:
256
- st.metric("πŸ“š Documents", info.get("document_count", 0))
257
- with col2:
258
- st.metric("πŸ–₯️ Device", device.upper())
259
- with col3:
260
- st.metric("πŸ” Context Docs", n_context_docs)
261
- with col4:
262
- pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
263
- st.metric("πŸ“ PDF Files", pdf_count)
264
-
265
- else:
266
- st.error("❌ Failed to initialize RAG pipeline")
267
- st.info("πŸ’‘ **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from pathlib import Path
4
+ from rag_pipeline import RAGPipeline
5
+ import time
6
+
# Page configuration: must run before any other Streamlit call renders output.
st.set_page_config(
    page_title="Local Multimodal RAG",
    page_icon="πŸ“š",
    layout="wide",
    initial_sidebar_state="expanded",
)

st.title("πŸ“š Local Multimodal RAG System")
st.markdown("**Analyze PDF documents locally with Mistral + CLIP embeddings**")

# Seed per-session defaults. Factories (rather than shared values) give every
# session its own fresh list for "uploaded_files".
for state_key, make_default in (
    ("uploaded_files", list),
    ("rag_pipeline", lambda: None),
    ("last_upload_time", lambda: 0),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = make_default()

def _invalidate_pipeline():
    """Forget the current pipeline so the next script run rebuilds it.

    Resetting ``st.session_state.rag_pipeline`` alone is not enough: the
    pipeline factory is wrapped in ``st.cache_resource`` and would hand back
    the previously built (now stale) object. ``st.cache_resource.clear()``
    drops every resource-cache entry of the app, which forces a re-index.
    """
    st.session_state.rag_pipeline = None
    st.cache_resource.clear()


# Sidebar configuration
with st.sidebar:
    st.header("βš™οΈ Configuration")

    pdf_dir = st.text_input(
        "πŸ“ PDF Directory",
        value="./pdfs",
        help="Path to directory containing PDF files"
    )
    # An emptied text box would make os.makedirs("") raise; fall back to the default.
    pdf_dir = pdf_dir.strip() or "./pdfs"

    # Ensure directory exists
    os.makedirs(pdf_dir, exist_ok=True)

    device = st.selectbox(
        "πŸ–₯️ Device",
        ["cpu", "cuda"],
        help="Device for model inference"
    )

    n_context_docs = st.slider(
        "πŸ“„ Context Documents",
        min_value=1,
        max_value=10,
        value=3,
        help="Number of documents to retrieve for context"
    )

    st.divider()

    # PDF Upload Section with Form
    st.subheader("πŸ“€ Upload PDF Files")

    # The form batches file selection and submission into a single rerun and
    # empties the uploader afterwards (clear_on_submit).
    with st.form("pdf_upload_form", clear_on_submit=True):
        uploaded_pdfs = st.file_uploader(
            "Choose PDF files to upload",
            type="pdf",
            accept_multiple_files=True,
            help="Select one or more PDF files to add to the system"
        )

        submit_button = st.form_submit_button("⬆️ Upload PDFs", use_container_width=True)

    if submit_button and uploaded_pdfs:
        saved_count = 0

        for uploaded_file in uploaded_pdfs:
            # Keep only the final path component so a crafted file name
            # cannot point outside pdf_dir.
            safe_name = os.path.basename(uploaded_file.name)
            try:
                with open(os.path.join(pdf_dir, safe_name), "wb") as f:
                    f.write(uploaded_file.getbuffer())
            except OSError as e:
                st.error(f"Failed to upload {uploaded_file.name}: {str(e)}")
                continue

            # Re-uploading a file overwrites it on disk; list it only once.
            if safe_name not in st.session_state.uploaded_files:
                st.session_state.uploaded_files.append(safe_name)
            saved_count += 1

        # Report whatever was saved, even if some other files failed.
        if saved_count > 0:
            st.session_state.last_upload_time = time.time()
            st.success(f"βœ… Uploaded {saved_count} PDF(s) successfully!")
            st.info("πŸ“Œ Click 'Reload & Index' below to process them.")
            # No st.rerun() here - the form's clear_on_submit resets the uploader.

    st.divider()

    # Display uploaded files (sorted so the list order is stable between reruns)
    pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))
    if pdf_files:
        st.subheader(f"πŸ“š Documents ({len(pdf_files)})")

        for pdf_file in pdf_files:
            col1, col2 = st.columns([4, 1])
            with col1:
                st.write(f"β€’ {pdf_file.name}")
            with col2:
                if st.button("πŸ—‘οΈ", key=f"delete_{pdf_file.name}", help="Delete this file"):
                    try:
                        os.remove(pdf_file)
                    except OSError as e:
                        st.error(f"Failed to delete: {str(e)}")
                    else:
                        # Rerun from the else-branch: st.rerun() works by
                        # raising, so it must not sit inside the try body.
                        if pdf_file.name in st.session_state.uploaded_files:
                            st.session_state.uploaded_files.remove(pdf_file.name)
                        _invalidate_pipeline()
                        st.success(f"Deleted {pdf_file.name}")
                        time.sleep(0.5)  # leave the message visible briefly
                        st.rerun()
    else:
        st.info("πŸ“­ No PDF files in directory yet")

    st.divider()

    # Reload/Index and Clear All buttons
    col1, col2 = st.columns(2)
    with col1:
        if st.button("πŸ”„ Reload & Index", use_container_width=True):
            _invalidate_pipeline()
            st.rerun()

    with col2:
        if st.button("πŸ—‘οΈ Clear All", use_container_width=True):
            # Best-effort delete of every PDF; remember what could not be removed.
            undeleted = []
            for pdf_file in Path(pdf_dir).glob("*.pdf"):
                try:
                    os.remove(pdf_file)
                except OSError:
                    undeleted.append(pdf_file.name)

            _invalidate_pipeline()
            st.session_state.uploaded_files = []
            if undeleted:
                st.warning(f"Could not delete: {', '.join(undeleted)}")
            else:
                st.success("All PDFs cleared")
            time.sleep(0.5)
            st.rerun()


# Initialize pipeline
@st.cache_resource
def init_rag_pipeline(_device, _pdf_dir):
    """Build a ``RAGPipeline`` over ``_pdf_dir`` and index its PDFs.

    Creates ``_pdf_dir`` if it is missing. Returns a ``(pipeline, error)``
    pair: ``(pipeline, None)`` on success, ``(None, message)`` when the
    directory holds no ``*.pdf`` file or when construction/indexing raises.

    NOTE(review): the result is memoised by ``st.cache_resource``; both
    parameters carry a leading underscore, which per the Streamlit docs
    excludes them from the cache key - confirm that callers clear the cache
    when the directory content, the directory or the device changes.
    """
    os.makedirs(_pdf_dir, exist_ok=True)

    # Nothing to index: report it instead of building an empty pipeline.
    if not any(Path(_pdf_dir).glob("*.pdf")):
        return None, f"No PDF files found in {_pdf_dir}"

    try:
        with st.spinner("⏳ Initializing models..."):
            rag = RAGPipeline(pdf_dir=_pdf_dir, device=_device)

        with st.spinner("⏳ Indexing PDFs..."):
            rag.index_pdfs()
    except Exception as exc:
        # Failures are handed back as text so the caller decides how to show them.
        return None, str(exc)

    return rag, None


# Get or initialize pipeline
if st.session_state.rag_pipeline is None:
    # Without any PDF there is nothing to index: show onboarding help and
    # end this script run.
    if not any(Path(pdf_dir).glob("*.pdf")):
        st.warning("πŸ“­ No PDF files found")
        st.info(
            "**How to get started:**\n"
            "1. πŸ“€ Upload PDF files using the sidebar file uploader\n"
            "2. βœ… Click 'Upload PDFs' to save them\n"
            "3. πŸ”„ Click 'Reload & Index' to process\n"
            "4. ❓ Ask questions in the Q&A tab\n"
        )
        st.stop()

    pipeline, error = init_rag_pipeline(device, pdf_dir)
    if error or pipeline is None:
        # The factory is cached, so a failed attempt would otherwise be
        # replayed forever; drop it so the next run really retries.
        init_rag_pipeline.clear()
        st.error(f"❌ Error: {error}")
        st.stop()
    st.session_state.rag_pipeline = pipeline
else:
    pipeline = st.session_state.rag_pipeline


def _reset_text_widget(state_key):
    """Blank the keyed text widget ``state_key``.

    Meant to be used as a button ``on_click`` callback: callbacks run before
    the script re-executes, which is the point at which a widget's stored
    value may still be overwritten. A bare ``st.rerun()`` would keep the
    old text because keyed widget values survive reruns.
    """
    st.session_state[state_key] = ""


# Main content
if pipeline:
    # Tabs
    tab1, tab2, tab3 = st.tabs(["❓ Q&A", "πŸ“Š Summary", "πŸ“– Retrieval"])

    # Tab 1: Question Answering
    with tab1:
        st.subheader("Ask Questions about Your Documents")

        question = st.text_area(
            "Your question (in Russian or English):",
            height=100,
            placeholder="What is this document about? What are the main points? Etc.",
            key="qa_question"
        )

        col1, col2 = st.columns(2)
        with col1:
            get_answer_btn = st.button("πŸ” Get Answer", use_container_width=True)
        with col2:
            # The click itself triggers the rerun; the callback empties the box.
            st.button(
                "πŸ—‘οΈ Clear",
                use_container_width=True,
                on_click=_reset_text_widget,
                args=("qa_question",),
            )

        if get_answer_btn:
            if question.strip():
                with st.spinner("⏳ Retrieving documents and generating answer..."):
                    try:
                        result = pipeline.answer_question(question, n_context_docs=n_context_docs)
                    except Exception as e:
                        st.error(f"Error: {str(e)}")
                        result = None  # marks "error already reported"

                if result and result.get("answer"):
                    st.success("βœ“ Answer generated!")

                    st.subheader("πŸ“ Answer")
                    st.write(result["answer"])

                    # Tolerate a result without a "sources" entry.
                    with st.expander("πŸ“š Sources Used"):
                        for i, source in enumerate(result.get("sources", []), 1):
                            st.write(f"{i}. {source}")

                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("Documents Used", result.get("context_used", 0))
                    with col2:
                        st.metric("Answer Length", len(result["answer"]))
                elif result is not None:
                    # The call succeeded but produced nothing to display.
                    st.warning("No answer was generated")
            else:
                st.warning("Please enter a question")

    # Tab 2: Document Summary
    with tab2:
        st.subheader("Summary of Indexed Documents")

        if st.button("πŸ“Š Generate Summary", use_container_width=True):
            with st.spinner("⏳ Generating summary..."):
                try:
                    summary = pipeline.summarize_documents()
                    st.success("βœ“ Summary generated!")
                    st.subheader("πŸ“„ Document Summary")
                    st.write(summary)
                except Exception as e:
                    st.error(f"Error: {str(e)}")

    # Tab 3: Document Retrieval
    with tab3:
        st.subheader("Search and Retrieve Documents")

        search_query = st.text_input(
            "Search query:",
            placeholder="Enter search terms...",
            key="retrieval_search"
        )

        col1, col2 = st.columns(2)
        with col1:
            search_btn = st.button("πŸ”Ž Search", use_container_width=True)
        with col2:
            st.button(
                "Clear Search",
                use_container_width=True,
                on_click=_reset_text_widget,
                args=("retrieval_search",),
            )

        if search_btn:
            if search_query.strip():
                with st.spinner("⏳ Searching..."):
                    try:
                        results = pipeline.retrieve_documents(search_query, n_results=n_context_docs)
                    except Exception as e:
                        st.error(f"Search error: {str(e)}")
                        # None (not []) so the "no match" warning is not
                        # shown on top of the error message.
                        results = None

                if results:
                    st.success(f"βœ“ Found {len(results)} documents")

                    # Only the first hit starts expanded.
                    for i, doc in enumerate(results, 1):
                        with st.expander(f"πŸ“„ Document {i} - {doc['source']}", expanded=(i==1)):
                            st.write(doc["content"])
                elif results is not None:
                    st.warning("No documents found matching your query")
            else:
                st.warning("Please enter a search query")

    # Footer
    st.divider()
    with st.expander("ℹ️ System Information"):
        info = pipeline.vector_store.get_collection_info()
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("πŸ“š Chunks", info.get("document_count", 0))
        with col2:
            st.metric("πŸ–₯️ Device", device.upper())
        with col3:
            st.metric("πŸ” Context", n_context_docs)
        with col4:
            # Counted from disk on every run, so it can differ from what is indexed.
            pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
            st.metric("πŸ“ PDFs", pdf_count)