dnj0 commited on
Commit
893bbbd
Β·
verified Β·
1 Parent(s): d657efd

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +466 -223
src/app.py CHANGED
@@ -1,223 +1,466 @@
1
- import streamlit as st
2
- import os
3
- from pathlib import Path
4
- from dotenv import load_dotenv
5
-
6
- # Load environment variables
7
- load_dotenv()
8
-
9
- # Import custom modules
10
- from pdf_processor import PDFProcessor, prepare_documents_for_embedding
11
- from embeddings_handler import CLIPLangChainEmbeddings
12
- from vectorstore_manager import VectorStoreManager
13
- from image_summarizer import ImageSummarizer, process_images_in_documents
14
- from rag_chain import RAGChain
15
- from langchain_core.documents import Document
16
-
17
- # Page configuration
18
- st.set_page_config(
19
- page_title="Multimodal RAG Assistant",
20
- page_icon="πŸ“„",
21
- layout="wide",
22
- initial_sidebar_state="expanded"
23
- )
24
-
25
- st.markdown("""
26
- <style>
27
- .main {
28
- padding: 2rem;
29
- }
30
- .stChatMessage {
31
- padding: 1rem;
32
- border-radius: 0.5rem;
33
- margin-bottom: 1rem;
34
- }
35
- </style>
36
- """, unsafe_allow_html=True)
37
-
38
- # Initialize session state
39
- if "vector_store" not in st.session_state:
40
- st.session_state.vector_store = None
41
- if "rag_chain" not in st.session_state:
42
- st.session_state.rag_chain = None
43
- if "document_count" not in st.session_state:
44
- st.session_state.document_count = 0
45
-
46
- # Sidebar configuration
47
- st.sidebar.title("βš™οΈ Configuration")
48
- st.sidebar.markdown("---")
49
-
50
- # OpenAI API Key
51
- api_key = st.sidebar.text_input(
52
- "OpenAI API Key",
53
- type="password",
54
- value=os.getenv("OPENAI_API_KEY", ""),
55
- help="Enter your OpenAI API key"
56
- )
57
-
58
- if api_key:
59
- os.environ["OPENAI_API_KEY"] = api_key
60
-
61
- # PDF directory setup
62
- pdf_dir = st.sidebar.text_input(
63
- "PDF Directory",
64
- value="./pdfs",
65
- help="Directory containing PDF files"
66
- )
67
-
68
- # Vector store settings
69
- st.sidebar.markdown("### Vector Store")
70
- collection_name = st.sidebar.text_input(
71
- "Collection Name",
72
- value="pdf_documents",
73
- help="ChromaDB collection name"
74
- )
75
-
76
- persist_dir = st.sidebar.text_input(
77
- "Persist Directory",
78
- value="./chroma_db",
79
- help="Directory for ChromaDB storage"
80
- )
81
-
82
- # Initialize vector store button
83
- if st.sidebar.button("πŸ”„ Initialize Vector Store", use_container_width=True):
84
- with st.spinner("Initializing vector store..."):
85
- try:
86
- # Initialize embeddings
87
- embeddings = CLIPLangChainEmbeddings(
88
- model_name="ViT-B-32",
89
- pretrained="openai"
90
- )
91
-
92
- # Initialize vector store
93
- st.session_state.vector_store = VectorStoreManager(
94
- persist_dir=persist_dir,
95
- collection_name=collection_name,
96
- embeddings=embeddings
97
- )
98
-
99
- # Initialize RAG chain
100
- retriever = st.session_state.vector_store.get_retriever()
101
- st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)
102
-
103
- st.session_state.document_count = st.session_state.vector_store.collection_count()
104
- st.success("βœ… Vector store initialized!")
105
-
106
- except Exception as e:
107
- st.error(f"❌ Error initializing vector store: {str(e)}")
108
-
109
- # Load and process PDFs button
110
- if st.sidebar.button("πŸ“₯ Load & Process PDFs", use_container_width=True):
111
- if not api_key:
112
- st.error("Please enter OpenAI API Key first")
113
- elif st.session_state.vector_store is None:
114
- st.error("Please initialize vector store first")
115
- else:
116
- with st.spinner("Processing PDFs..."):
117
- try:
118
- # Process PDFs
119
- pdf_processor = PDFProcessor(pdf_dir=pdf_dir)
120
- documents_data = pdf_processor.process_all_pdfs()
121
-
122
- if not documents_data:
123
- st.warning(f"No PDFs found in {pdf_dir}")
124
- else:
125
- # Summarize images
126
- image_summarizer = ImageSummarizer(api_key=api_key)
127
- documents_data = process_images_in_documents(
128
- documents_data,
129
- image_summarizer
130
- )
131
-
132
- # Prepare documents for embedding
133
- all_documents = []
134
- for doc_data in documents_data:
135
- doc_tuples = prepare_documents_for_embedding(doc_data)
136
- for text, metadata in doc_tuples:
137
- all_documents.append(
138
- Document(page_content=text, metadata=metadata)
139
- )
140
-
141
- # Add to vector store
142
- st.session_state.vector_store.add_documents(all_documents)
143
- st.session_state.document_count = st.session_state.vector_store.collection_count()
144
-
145
- # Reinitialize RAG chain
146
- retriever = st.session_state.vector_store.get_retriever()
147
- st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)
148
-
149
- st.success(f"βœ… Processed {len(documents_data)} PDFs with {len(all_documents)} chunks")
150
- st.info(f"Total documents in store: {st.session_state.document_count}")
151
-
152
- except Exception as e:
153
- st.error(f"❌ Error processing PDFs: {str(e)}")
154
-
155
- # Display vector store status
156
- st.sidebar.markdown("### Status")
157
- if st.session_state.vector_store:
158
- doc_count = st.session_state.vector_store.collection_count()
159
- st.sidebar.success(f"βœ… Vector Store Ready")
160
- st.sidebar.metric("Documents in Store", doc_count)
161
- else:
162
- st.sidebar.warning("⚠️ Vector Store Not Initialized")
163
-
164
- # Main content area
165
- st.title("πŸ“„ Multimodal PDF RAG Assistant")
166
- st.markdown("Ask questions about your PDF documents. Responses will be provided in Russian.")
167
-
168
- # Check if system is ready
169
- if st.session_state.rag_chain is None:
170
- st.info("""
171
- ### Getting Started:
172
- 1. Enter your OpenAI API Key in the sidebar
173
- 2. Click "Initialize Vector Store"
174
- 3. Place PDF files in the configured directory
175
- 4. Click "Load & Process PDFs"
176
- 5. Ask questions in the chat below
177
- """)
178
- else:
179
- # Chat interface
180
- st.markdown("---")
181
- st.markdown("### Ask a Question")
182
-
183
- col1, col2 = st.columns([1, 0.15])
184
-
185
- with col1:
186
- user_question = st.text_input(
187
- "Your question:",
188
- placeholder="Ask about your documents...",
189
- label_visibility="collapsed"
190
- )
191
-
192
- with col2:
193
- search_button = st.button("πŸ” Search", use_container_width=True)
194
-
195
- # Process question
196
- if search_button and user_question:
197
- with st.spinner("πŸ€– Searching documents and generating response..."):
198
- try:
199
- result = st.session_state.rag_chain.query(user_question)
200
-
201
- # Display answer
202
- st.markdown("### Answer")
203
- st.markdown(result["answer"])
204
-
205
- # Display sources
206
- if result["sources"]:
207
- st.markdown("### Sources")
208
- for i, source in enumerate(result["sources"], 1):
209
- with st.expander(f"Source {i} - {source['metadata'].get('filename', 'Unknown')}"):
210
- st.markdown(f"**Type:** {source['metadata'].get('type', 'Unknown')}")
211
- st.markdown(f"**Page:** {source['metadata'].get('page', 'Unknown')}")
212
- st.markdown(f"**Content:** {source['content']}")
213
-
214
- except Exception as e:
215
- st.error(f"Error processing question: {str(e)}")
216
-
217
- # Footer
218
- st.markdown("---")
219
- st.markdown("""
220
- <div style="text-align: center; color: gray; font-size: 0.8rem;">
221
- Powered by LangChain, ChromaDB, CLIP, and OpenAI
222
- </div>
223
- """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app_with_upload_simple.py
2
+
3
+ import streamlit as st
4
+ import logging
5
+ import os
6
+ from pathlib import Path
7
+ from datetime import datetime
8
+ import base64
9
+
10
+ # Setup logging
11
+ logging.getLogger("pdfminer").setLevel(logging.ERROR)
12
+
13
+ from pdf_processor import PDFProcessor, prepare_documents_for_embedding
14
+ from embeddings_handler import CLIPLangChainEmbeddings
15
+ from vectorstore_manager import VectorStoreManager
16
+ from rag_chain import RAGChain
17
+ from langchain_core.documents import Document
18
+
19
+ # ============================================================================
20
+ # PAGE CONFIGURATION
21
+ # ============================================================================
22
+
23
+ st.set_page_config(
24
+ page_title="Multimodal RAG Assistant",
25
+ page_icon="πŸ“„",
26
+ layout="wide",
27
+ initial_sidebar_state="expanded"
28
+ )
29
+
30
+ # Custom CSS
31
+ st.markdown("""
32
+ <style>
33
+ .main { padding: 2rem; }
34
+ .stTabs [data-baseweb="tab-list"] { gap: 2rem; }
35
+ .metric-card { background-color: #f8f9fa; padding: 15px; border-radius: 5px; }
36
+ </style>
37
+ """, unsafe_allow_html=True)
38
+
39
+ # ============================================================================
40
+ # SESSION STATE INITIALIZATION
41
+ # ============================================================================
42
+
43
+ if "processor" not in st.session_state:
44
+ st.session_state.processor = None
45
+
46
+ if "vector_store" not in st.session_state:
47
+ st.session_state.vector_store = None
48
+
49
+ if "rag_chain" not in st.session_state:
50
+ st.session_state.rag_chain = None
51
+
52
+ if "embeddings" not in st.session_state:
53
+ st.session_state.embeddings = None
54
+
55
+ if "documents_processed" not in st.session_state:
56
+ st.session_state.documents_processed = 0
57
+
58
+ if "extracted_content" not in st.session_state:
59
+ st.session_state.extracted_content = []
60
+
61
+ # ============================================================================
62
+ # HELPER FUNCTIONS
63
+ # ============================================================================
64
+
65
@st.cache_resource
def init_processor(pdf_dir="./pdfs"):
    """Build and cache the PDF processor for *pdf_dir* (one per directory)."""
    processor = PDFProcessor(pdf_dir=pdf_dir)
    return processor
69
+
70
@st.cache_resource
def init_embeddings():
    """Build and cache the CLIP embedding model (ViT-B-32, OpenAI weights)."""
    model = CLIPLangChainEmbeddings(model_name="ViT-B-32", pretrained="openai")
    return model
74
+
75
@st.cache_resource
def init_vector_store(embeddings):
    """Build and cache the ChromaDB-backed vector store.

    Uses the fixed on-disk location ``./chroma_db`` and the
    ``pdf_documents`` collection.
    """
    store = VectorStoreManager(
        persist_dir="./chroma_db",
        collection_name="pdf_documents",
        embeddings=embeddings,
    )
    return store
83
+
84
def save_uploaded_files(uploaded_files, target_dir="./pdfs"):
    """Persist Streamlit-uploaded files to *target_dir*.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing
            ``.name`` and ``.getbuffer()`` (Streamlit UploadedFile).
        target_dir: Destination directory; created (with parents) if missing.

    Returns:
        list[str]: Names of the files actually written, in input order.
    """
    target = Path(target_dir)
    target.mkdir(parents=True, exist_ok=True)

    saved_files = []
    for uploaded_file in uploaded_files:
        # Strip any directory components so a crafted upload name
        # (e.g. "../x.pdf") cannot escape target_dir.
        safe_name = Path(uploaded_file.name).name
        (target / safe_name).write_bytes(uploaded_file.getbuffer())
        saved_files.append(safe_name)

    return saved_files
96
+
97
+ def get_document_stats(content):
98
+ """Get statistics from extracted content."""
99
+ stats = {
100
+ "pages": len(content.get("pages", [])),
101
+ "total_text": sum(len(p.get("text", "")) for p in content.get("pages", [])),
102
+ "tables": sum(len(p.get("tables", [])) for p in content.get("pages", [])),
103
+ "images": sum(len(p.get("images", [])) for p in content.get("pages", []))
104
+ }
105
+ return stats
106
+
107
+ # ============================================================================
108
+ # MAIN APP
109
+ # ============================================================================
110
+
111
+ st.title("πŸ“„ Multimodal PDF RAG Assistant")
112
+ st.markdown("Upload PDFs, extract content, and query with multimodal embeddings.")
113
+
114
+ # ============================================================================
115
+ # SIDEBAR - CONFIGURATION & UPLOAD
116
+ # ============================================================================
117
+
118
+ with st.sidebar:
119
+ st.header("βš™οΈ Configuration & Upload")
120
+
121
+ # API Key
122
+ api_key = st.text_input(
123
+ "OpenAI API Key",
124
+ type="password",
125
+ value=os.getenv("OPENAI_API_KEY", ""),
126
+ help="Your OpenAI API key"
127
+ )
128
+
129
+ if api_key:
130
+ os.environ["OPENAI_API_KEY"] = api_key
131
+
132
+ st.markdown("---")
133
+
134
+ # PDF Upload Section
135
+ st.markdown("### πŸ“€ Upload PDFs")
136
+
137
+ uploaded_pdfs = st.file_uploader(
138
+ "Choose PDF files",
139
+ type="pdf",
140
+ accept_multiple_files=True,
141
+ key="pdf_uploader",
142
+ help="Upload one or more PDF files"
143
+ )
144
+
145
+ if uploaded_pdfs:
146
+ st.info(f"πŸ“¦ {len(uploaded_pdfs)} file(s) selected")
147
+
148
+ if st.button("πŸ’Ύ Save & Process PDFs", use_container_width=True):
149
+ # Save files
150
+ with st.spinner("πŸ“₯ Saving files..."):
151
+ saved_files = save_uploaded_files(uploaded_pdfs)
152
+ st.success(f"βœ… Saved {len(saved_files)} file(s)")
153
+
154
+ # Initialize processor
155
+ with st.spinner("πŸ”„ Initializing processor..."):
156
+ processor = init_processor()
157
+ st.session_state.processor = processor
158
+
159
+ # Process PDFs
160
+ with st.spinner("πŸ“– Processing PDFs..."):
161
+ documents = processor.process_all_pdfs()
162
+ st.session_state.extracted_content = documents
163
+ st.session_state.documents_processed = len(documents)
164
+
165
+ # Prepare chunks for embedding
166
+ all_chunks = []
167
+ for doc_content in documents:
168
+ chunks = prepare_documents_for_embedding(doc_content)
169
+ all_chunks.extend(chunks)
170
+
171
+ st.success(f"βœ… Processed {len(documents)} PDF(s), {len(all_chunks)} chunks")
172
+
173
+ # Initialize embeddings and vector store
174
+ with st.spinner("πŸ”— Creating vector store..."):
175
+ embeddings = init_embeddings()
176
+ st.session_state.embeddings = embeddings
177
+
178
+ vector_store = init_vector_store(embeddings)
179
+ st.session_state.vector_store = vector_store
180
+
181
+ # Add documents to vector store
182
+ docs_for_store = [
183
+ Document(page_content=text, metadata=meta)
184
+ for text, meta in all_chunks
185
+ ]
186
+ vector_store.add_documents(docs_for_store)
187
+
188
+ # Initialize RAG chain
189
+ retriever = vector_store.get_retriever()
190
+ rag_chain = RAGChain(retriever, api_key=api_key)
191
+ st.session_state.rag_chain = rag_chain
192
+
193
+ st.success("βœ… Ready to query!")
194
+
195
+ st.markdown("---")
196
+
197
+ # Status
198
+ st.markdown("### πŸ“Š Status")
199
+
200
+ if st.session_state.documents_processed > 0:
201
+ st.metric("Documents Processed", st.session_state.documents_processed)
202
+
203
+ total_pages = sum(
204
+ len(doc.get("pages", []))
205
+ for doc in st.session_state.extracted_content
206
+ )
207
+ st.metric("Total Pages", total_pages)
208
+
209
+ total_images = sum(
210
+ sum(len(p.get("images", [])) for p in doc.get("pages", []))
211
+ for doc in st.session_state.extracted_content
212
+ )
213
+ st.metric("Total Images", total_images)
214
+ else:
215
+ st.info("Upload and process PDFs to get started")
216
+
217
+ # ============================================================================
218
+ # MAIN CONTENT AREA - TABS
219
+ # ============================================================================
220
+
221
+ if st.session_state.documents_processed == 0:
222
+ st.warning("πŸ‘ˆ Upload PDFs in the sidebar to get started")
223
+ else:
224
+ tab1, tab2, tab3, tab4 = st.tabs(["πŸ” Query", "πŸ“Š Documents", "πŸ–ΌοΈ Images", "ℹ️ Info"])
225
+
226
+ # ====================================================================
227
+ # TAB 1: QUERY
228
+ # ====================================================================
229
+
230
+ with tab1:
231
+ st.header("πŸ” Ask Questions")
232
+ st.markdown("Ask questions about your PDF documents.")
233
+
234
+ if st.session_state.rag_chain is None:
235
+ st.warning("⚠️ Please process PDFs first using the sidebar.")
236
+ else:
237
+ col1, col2 = st.columns([5, 1])
238
+
239
+ with col1:
240
+ user_query = st.text_input(
241
+ "Your question:",
242
+ placeholder="What is this document about?",
243
+ label_visibility="collapsed"
244
+ )
245
+
246
+ with col2:
247
+ search_button = st.button("πŸ” Search", use_container_width=True)
248
+
249
+ if search_button and user_query:
250
+ with st.spinner("πŸ€– Searching and generating response..."):
251
+ try:
252
+ result = st.session_state.rag_chain.query(user_query)
253
+
254
+ # Display answer
255
+ st.markdown("### πŸ“ Answer")
256
+ st.markdown(result["answer"])
257
+
258
+ # Display sources
259
+ if result["sources"]:
260
+ st.markdown("### πŸ“š Sources")
261
+ for i, source in enumerate(result["sources"], 1):
262
+ with st.expander(f"Source {i} - {source['metadata'].get('filename', 'Unknown')}"):
263
+ st.markdown(f"**Type:** {source['metadata'].get('type', 'Unknown')}")
264
+ st.markdown(f"**Page:** {source['metadata'].get('page', 'Unknown')}")
265
+ st.markdown(f"**Content:** {source['content'][:500]}...")
266
+
267
+ except Exception as e:
268
+ st.error(f"❌ Error: {str(e)}")
269
+
270
+ # ====================================================================
271
+ # TAB 2: DOCUMENTS
272
+ # ====================================================================
273
+
274
+ with tab2:
275
+ st.header("πŸ“Š Processed Documents")
276
+
277
+ if not st.session_state.extracted_content:
278
+ st.info("No documents processed yet.")
279
+ else:
280
+ # Overall statistics
281
+ col1, col2, col3, col4 = st.columns(4)
282
+
283
+ with col1:
284
+ st.metric("Documents", len(st.session_state.extracted_content))
285
+
286
+ with col2:
287
+ total_pages = sum(
288
+ len(doc.get("pages", []))
289
+ for doc in st.session_state.extracted_content
290
+ )
291
+ st.metric("Pages", total_pages)
292
+
293
+ with col3:
294
+ total_images = sum(
295
+ sum(len(p.get("images", [])) for p in doc.get("pages", []))
296
+ for doc in st.session_state.extracted_content
297
+ )
298
+ st.metric("Images", total_images)
299
+
300
+ with col4:
301
+ total_tables = sum(
302
+ sum(len(p.get("tables", [])) for p in doc.get("pages", []))
303
+ for doc in st.session_state.extracted_content
304
+ )
305
+ st.metric("Tables", total_tables)
306
+
307
+ st.markdown("---")
308
+
309
+ # Document details
310
+ st.markdown("### πŸ“„ Document Details")
311
+
312
+ for idx, doc in enumerate(st.session_state.extracted_content, 1):
313
+ filename = doc.get("filename", f"Document {idx}")
314
+ stats = get_document_stats(doc)
315
+
316
+ with st.expander(f"πŸ“‘ {filename}"):
317
+ col1, col2, col3, col4 = st.columns(4)
318
+
319
+ with col1:
320
+ st.metric("Pages", stats["pages"])
321
+ with col2:
322
+ st.metric("Images", stats["images"])
323
+ with col3:
324
+ st.metric("Tables", stats["tables"])
325
+ with col4:
326
+ st.metric("Text (KB)", round(stats["total_text"] / 1024, 1))
327
+
328
+ # Preview pages
329
+ st.markdown("#### First 3 Pages Preview:")
330
+ for page in doc.get("pages", [])[:3]:
331
+ page_num = page.get("page_number")
332
+ text = page.get("text", "")[:200]
333
+ st.write(f"**Page {page_num}:** {text}...")
334
+
335
+ # ====================================================================
336
+ # TAB 3: IMAGES
337
+ # ====================================================================
338
+
339
+ with tab3:
340
+ st.header("πŸ–ΌοΈ Extracted Images")
341
+
342
+ if not st.session_state.extracted_content:
343
+ st.info("No images extracted yet.")
344
+ else:
345
+ image_count = 0
346
+
347
+ for doc_idx, doc in enumerate(st.session_state.extracted_content, 1):
348
+ filename = doc.get("filename", f"Document {doc_idx}")
349
+
350
+ for page in doc.get("pages", []):
351
+ page_num = page.get("page_number")
352
+ images = page.get("images", [])
353
+
354
+ if images:
355
+ st.markdown(f"### πŸ“„ {filename} - Page {page_num}")
356
+
357
+ img_cols = st.columns(min(len(images), 2))
358
+
359
+ for idx, image in enumerate(images):
360
+ with img_cols[idx % 2]:
361
+ # Try to display image
362
+ if image.get("base64"):
363
+ try:
364
+ st.image(
365
+ f"data:image/{image.get('format', 'png')};base64,{image.get('base64')}",
366
+ caption=f"Image {image.get('index')}",
367
+ use_column_width=True
368
+ )
369
+ image_count += 1
370
+ except Exception as e:
371
+ st.warning(f"Could not display image: {e}")
372
+ else:
373
+ st.warning("No image data available")
374
+
375
+ if image_count == 0:
376
+ st.info("No images were successfully extracted from the PDFs.")
377
+
378
+ # ====================================================================
379
+ # TAB 4: INFO
380
+ # ====================================================================
381
+
382
+ with tab4:
383
+ st.header("ℹ️ System Information")
384
+
385
+ st.markdown("### 🎯 Features")
386
+
387
+ features = {
388
+ "βœ… PDF Upload": "Upload multiple PDFs via UI",
389
+ "βœ… Text Extraction": "Extract text from documents",
390
+ "βœ… Table Detection": "Identify and extract tables",
391
+ "βœ… Image Extraction": "Extract and display images",
392
+ "βœ… CLIP Embeddings": "Multimodal embeddings",
393
+ "βœ… Vector Store": "ChromaDB for similarity search",
394
+ "βœ… RAG Chain": "LangChain with OpenAI",
395
+ "βœ… Russian Support": "Queries answered in Russian",
396
+ }
397
+
398
+ for feature, description in features.items():
399
+ st.markdown(f"**{feature}** - {description}")
400
+
401
+ st.markdown("---")
402
+
403
+ st.markdown("### πŸ“¦ System Status")
404
+
405
+ col1, col2, col3 = st.columns(3)
406
+
407
+ with col1:
408
+ if st.session_state.processor:
409
+ st.success("βœ… Processor Ready")
410
+ else:
411
+ st.warning("⚠️ Processor Not Initialized")
412
+
413
+ with col2:
414
+ if st.session_state.embeddings:
415
+ st.success("βœ… Embeddings Ready")
416
+ else:
417
+ st.warning("⚠️ Embeddings Not Initialized")
418
+
419
+ with col3:
420
+ if st.session_state.rag_chain:
421
+ st.success("βœ… RAG Chain Ready")
422
+ else:
423
+ st.warning("⚠️ RAG Chain Not Initialized")
424
+
425
+ st.markdown("---")
426
+
427
+ st.markdown("### πŸš€ How It Works")
428
+
429
+ st.markdown("""
430
+ 1. **Upload**: Select one or more PDF files
431
+ 2. **Process**: System extracts text, tables, and images
432
+ 3. **Embed**: Content converted to multimodal embeddings
433
+ 4. **Store**: Vectors stored in ChromaDB
434
+ 5. **Query**: Ask questions about documents
435
+ 6. **Retrieve**: Relevant content fetched from store
436
+ 7. **Generate**: OpenAI creates response
437
+ 8. **Display**: Answer and sources shown in UI
438
+ """)
439
+
440
+ st.markdown("---")
441
+
442
+ st.markdown("### πŸ”— Technology Stack")
443
+
444
+ tech_info = {
445
+ "PDF Processing": "PyMuPDF, pdfplumber",
446
+ "Embeddings": "CLIP ViT-B-32 (open-clip-torch)",
447
+ "Vector Store": "ChromaDB",
448
+ "LLM Framework": "LangChain",
449
+ "Language Model": "OpenAI GPT-4o-mini",
450
+ "Web UI": "Streamlit",
451
+ }
452
+
453
+ for tech, details in tech_info.items():
454
+ st.write(f"**{tech}:** {details}")
455
+
456
+ # ============================================================================
457
+ # FOOTER
458
+ # ============================================================================
459
+
460
+ st.markdown("---")
461
+ st.markdown(
462
+ "<div style='text-align: center; color: gray; font-size: 0.8rem;'>"
463
+ "Multimodal RAG LLM System | Powered by LangChain, ChromaDB, CLIP, and OpenAI"
464
+ "</div>",
465
+ unsafe_allow_html=True
466
+ )