dnj0 committed on
Commit
a6680e7
·
1 Parent(s): d993ee2
Files changed (5) hide show
  1. src/app.py +240 -83
  2. src/config.py +20 -9
  3. src/pdf_parser.py +41 -11
  4. src/rag_system.py +155 -53
  5. src/vector_store.py +49 -18
src/app.py CHANGED
@@ -1,25 +1,41 @@
 
 
 
 
 
1
  import streamlit as st
2
  import os
3
  from pathlib import Path
 
 
4
  from pdf_parser import PDFParser
5
  from vector_store import VectorStore
6
- from rag_system import VisualMultimodalRAG, AnsweringRAG
7
  from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
8
 
 
 
 
 
 
9
  st.set_page_config(
10
- page_title="Мультимодальная система RAG LLM",
11
- page_icon="",
12
  layout="wide",
13
  initial_sidebar_state="expanded"
14
  )
15
 
 
 
 
 
16
  if 'api_key_set' not in st.session_state:
17
  st.session_state.api_key_set = False
18
 
19
  if 'api_key' not in st.session_state:
20
  st.session_state.api_key = None
21
 
22
- if 'visual_rag_system' not in st.session_state:
23
  st.session_state.visual_rag_system = None
24
 
25
  if 'vector_store' not in st.session_state:
@@ -40,26 +56,39 @@ if 'current_images' not in st.session_state:
40
  if 'current_tables' not in st.session_state:
41
  st.session_state.current_tables = None
42
 
43
- if 'processing_results' not in st.session_state:
44
  st.session_state.processing_results = None
45
 
46
  if 'answering_rag' not in st.session_state:
47
  st.session_state.answering_rag = None
48
 
49
- st.title("Мультимодальная система RAG LLM")
50
 
51
- st.markdown("""
 
 
52
 
53
- Обработка PDF-документов с анализом визуального контента
 
 
 
 
 
 
54
  """)
55
 
 
 
 
 
 
56
  with st.sidebar:
57
- st.header("Конфигурация")
58
 
59
- st.subheader("Ключ API OpenAI")
 
60
 
61
  api_key = st.text_input(
62
- "Введите ваш ключ API OpenAI:",
63
  type="password",
64
  key="api_key_input"
65
  )
@@ -68,116 +97,154 @@ with st.sidebar:
68
  st.session_state.api_key = api_key
69
  st.session_state.api_key_set = True
70
 
 
71
  if st.session_state.visual_rag_system is None:
72
  try:
73
- st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True)
74
  st.session_state.vector_store = VectorStore()
75
  st.session_state.parser = PDFParser(debug=True)
76
- st.success("Ключ API установлен")
77
  except Exception as e:
78
- st.error(f"Ошибка при инициализации систем: {e}")
79
  else:
80
  st.session_state.api_key_set = False
81
- st.warning("Введите ключ API для продолжения")
82
 
83
  st.divider()
84
 
85
- st.subheader("Векторное хранилище")
86
-
87
  if st.session_state.vector_store:
88
  try:
89
  info = st.session_state.vector_store.get_collection_info()
90
- st.metric("Элементов в хранилище", info['count'])
91
- st.caption(f"Путь: {info['persist_path']}")
 
92
  except Exception as e:
93
- st.error(f"Ошибка получения информации о хранилище: {e}")
94
  else:
95
- st.info("Установите ключ API для инициализации векторного хранилища")
96
 
97
  st.divider()
98
 
99
- st.subheader("Управление документами")
100
-
101
- if st.button("Очистить векторное хранилище"):
102
  if st.session_state.vector_store:
103
  try:
104
  st.session_state.vector_store.clear_all()
105
- st.success("Векторное хранилище очищено")
106
  except Exception as e:
107
- st.error(f"Ошибка при очистке хранилища: {e}")
 
108
 
109
- st.header("Загрузка PDF-документа")
 
 
 
 
 
110
 
111
  uploaded_file = st.file_uploader(
112
- "Выберите PDF-файл",
113
  type=['pdf'],
114
- help="PDF с текстом, изображениями и таблицами"
115
  )
116
 
117
  if uploaded_file is not None:
 
118
  upload_path = Path(UPLOAD_FOLDER)
119
  upload_path.mkdir(exist_ok=True)
 
120
  file_path = upload_path / uploaded_file.name
121
  with open(file_path, 'wb') as f:
122
  f.write(uploaded_file.getbuffer())
123
- st.success(f"Файл сохранён: {uploaded_file.name}")
124
 
125
- if st.button("Распарсить PDF"):
 
 
 
126
  if not st.session_state.api_key_set:
127
- st.error("Введите ключ API для продолжения")
128
  else:
129
  try:
130
- with st.spinner("Парсинг PDF..."):
131
- print("PARSING: " + uploaded_file.name)
 
 
 
 
132
  parser = st.session_state.parser
133
  text, images, tables = parser.parse_pdf(str(file_path))
134
 
 
135
  st.session_state.current_document = uploaded_file.name
136
  st.session_state.current_text = text
137
  st.session_state.current_images = images
138
  st.session_state.current_tables = tables
139
 
 
140
  col1, col2, col3 = st.columns(3)
141
  with col1:
142
- st.metric("Текст", f"{len(text):,} символов")
143
  with col2:
144
- st.metric("Изображения", len(images))
145
  with col3:
146
- st.metric("Таблицы", len(tables))
147
 
 
148
  if images:
149
- st.subheader("Извлечённые изображения")
150
  for idx, img in enumerate(images):
151
  ocr_text = img.get('ocr_text', '')
152
  ocr_len = len(ocr_text)
 
153
  if ocr_len > 0:
154
- st.success(f"Изображение {idx}: {ocr_len} символов (OCR)")
155
  else:
156
- st.warning(f"Изображение {idx}: Текст OCR не найден (будет использоваться визуальный анализ)")
 
 
157
 
158
- st.success("Парсинг PDF завершён!")
159
  except Exception as e:
160
- st.error(f"Ошибка при парсинге PDF: {e}")
 
161
 
162
- st.divider()
163
 
164
- st.header("Анализ")
 
 
165
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- if st.button("Анализировать"):
168
  if not st.session_state.api_key_set:
169
- st.error("Введите ключ API для продолжения")
170
  elif st.session_state.current_text is None:
171
- st.error("Распарсьте PDF-документ")
172
  else:
173
  try:
174
- with st.spinner("Анализ изображений с помощью gpt-4o-mini..."):
175
- print("ANALYSIS")
 
 
 
 
176
  visual_rag = st.session_state.visual_rag_system
177
  vector_store = st.session_state.vector_store
 
178
  results = visual_rag.process_and_store_document(
179
  text=st.session_state.current_text,
180
- images=st.session_state.current_images,
181
  tables=st.session_state.current_tables,
182
  vector_store=vector_store,
183
  doc_id=st.session_state.current_document or "current_doc"
@@ -185,97 +252,187 @@ if st.button("Анализировать"):
185
 
186
  st.session_state.processing_results = results
187
 
188
- st.success("Анализ завершён и сохранён!")
 
189
 
190
  col1, col2, col3 = st.columns(3)
191
  with col1:
192
- st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
193
  with col2:
194
- st.metric("Фрагментов текста", len(results['text_summaries']))
195
  with col3:
196
- st.metric("Проанализировано таблиц", len(results['table_summaries']))
197
 
198
- st.metric("Всего сохранено в вектор", results['total_stored'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  except Exception as e:
201
- st.error(f"Ошибка при анализе: {e}")
 
 
 
 
 
 
202
 
203
  st.divider()
 
204
 
205
- st.header("Задать вопрос о документе")
 
 
206
 
 
207
  if st.session_state.api_key_set and st.session_state.answering_rag is None:
 
208
  st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
209
 
210
  question = st.text_area(
211
- "Введите ваш вопрос:",
212
  height=100,
213
- placeholder="О чем говорится в документе?"
214
  )
215
 
216
- if st.button("Поиск и генерация ответа"):
217
  if not st.session_state.api_key_set:
218
- st.error("Введите ключ API для продолжения")
219
  elif st.session_state.current_text is None:
220
- st.error("Распарсьте PDF-документ")
221
  elif not question:
222
- st.error("Введите вопрос")
223
  else:
224
  try:
225
- with st.spinner("Поиск в документе и анализ..."):
226
- print("QUESTION: " + question)
 
 
 
 
227
  store = st.session_state.vector_store
228
 
 
229
  doc_name = st.session_state.current_document or "current_doc"
230
  doc_data = {
231
  'text': st.session_state.current_text,
232
  'images': [],
233
  'tables': []
234
  }
235
-
236
  store.add_documents(doc_data, doc_name)
237
 
 
238
  search_results = store.search(question, n_results=5)
239
 
 
 
 
240
  answering_rag = st.session_state.answering_rag
241
  result = answering_rag.analyze_and_answer(question, search_results)
242
 
243
- st.success("Анализ завершён!")
 
244
 
245
- st.subheader("Ответ")
246
 
 
247
  col1, col2, col3 = st.columns(3)
248
  with col1:
249
- confidence_map = {
250
- 'high': 'ВЫСОКАЯ',
251
- 'medium': 'СРЕДНЯЯ',
252
- 'low': 'НИЗКАЯ'
253
- }
254
- confidence_text = confidence_map.get(result['confidence'], result['confidence'].upper())
255
- st.metric("Уверенность", confidence_text)
256
  with col2:
257
- st.metric("Использовано источников", result['sources_used'])
258
  with col3:
259
  if result['sources_used'] > 0:
260
- st.metric("Сред. релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
261
 
 
262
  st.write(result['answer'])
263
 
264
- if st.checkbox("Показать исходные документы"):
265
- st.subheader("Источники, использованные в ответе")
266
- for source in result.get('formatted_sources', []):
 
267
  relevance = source['relevance']
268
  relevance_bar = "█" * int(relevance * 10) + "░" * (10 - int(relevance * 10))
 
269
  with st.expander(
270
- f"Источник {source['index']} - {source['type'].upper()} "
271
  f"[{relevance_bar}] {relevance:.0%}"
272
  ):
273
  st.write(source['content'])
 
 
 
274
  except Exception as e:
275
- st.error(f"Ошибка при обработке вопроса: {e}")
 
 
 
 
 
 
276
 
277
  st.divider()
278
 
 
 
 
 
 
 
 
 
 
 
 
279
  st.caption(
280
- "Мультимодальная система RAG"
 
 
 
281
  )
 
1
+ """
2
+ Multimodal RAG LLM System - Streamlit App
3
+ Complete working version with VISUAL image analysis using gpt-4o
4
+ """
5
+
6
  import streamlit as st
7
  import os
8
  from pathlib import Path
9
+
10
+ # Import optimized versions
11
  from pdf_parser import PDFParser
12
  from vector_store import VectorStore
13
+ from rag_system import VisualMultimodalRAG # NEW - Vision model
14
  from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
15
 
16
+
17
+ # ============================================================================
18
+ # PAGE CONFIGURATION
19
+ # ============================================================================
20
+
21
  st.set_page_config(
22
+ page_title="📄 Multimodal RAG LLM System",
23
+ page_icon="🤖",
24
  layout="wide",
25
  initial_sidebar_state="expanded"
26
  )
27
 
28
+ # ============================================================================
29
+ # SESSION STATE INITIALIZATION
30
+ # ============================================================================
31
+
32
  if 'api_key_set' not in st.session_state:
33
  st.session_state.api_key_set = False
34
 
35
  if 'api_key' not in st.session_state:
36
  st.session_state.api_key = None
37
 
38
+ if 'visual_rag_system' not in st.session_state: # NEW - Vision model
39
  st.session_state.visual_rag_system = None
40
 
41
  if 'vector_store' not in st.session_state:
 
56
  if 'current_tables' not in st.session_state:
57
  st.session_state.current_tables = None
58
 
59
+ if 'processing_results' not in st.session_state: # NEW
60
  st.session_state.processing_results = None
61
 
62
  if 'answering_rag' not in st.session_state:
63
  st.session_state.answering_rag = None
64
 
 
65
 
66
+ # ============================================================================
67
+ # MAIN HEADER
68
+ # ============================================================================
69
 
70
+ st.title("📄 Multimodal RAG LLM System")
71
+ st.markdown("""
72
+ Process PDF documents with visual image analysis:
73
+ - **PDF Parser** with OCR for Russian & English
74
+ - **Visual Analysis** (gpt-4o) for image understanding
75
+ - **Vector Store** (ChromaDB) for semantic search
76
+ - **Individual Component** summarization and storage
77
  """)
78
 
79
+
80
+ # ============================================================================
81
+ # SIDEBAR - CONFIGURATION
82
+ # ============================================================================
83
+
84
  with st.sidebar:
85
+ st.header("⚙️ Configuration")
86
 
87
+ # API Key Section
88
+ st.subheader("🔑 OpenAI API Key")
89
 
90
  api_key = st.text_input(
91
+ "Enter your OpenAI API key:",
92
  type="password",
93
  key="api_key_input"
94
  )
 
97
  st.session_state.api_key = api_key
98
  st.session_state.api_key_set = True
99
 
100
+ # Initialize RAG systems if not already done
101
  if st.session_state.visual_rag_system is None:
102
  try:
103
+ st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
104
  st.session_state.vector_store = VectorStore()
105
  st.session_state.parser = PDFParser(debug=True)
106
+ st.success(" API Key set & systems initialized")
107
  except Exception as e:
108
+ st.error(f"Error initializing systems: {e}")
109
  else:
110
  st.session_state.api_key_set = False
111
+ st.warning("⚠️ Please enter your API key to continue")
112
 
113
  st.divider()
114
 
115
+ # Vector Store Status
116
+ st.subheader("📊 Vector Store Status")
117
  if st.session_state.vector_store:
118
  try:
119
  info = st.session_state.vector_store.get_collection_info()
120
+ st.metric("Items in Store", info['count'])
121
+ st.metric("Status", info['status'])
122
+ st.caption(f"Path: {info['persist_path']}")
123
  except Exception as e:
124
+ st.error(f"Error getting store info: {e}")
125
  else:
126
+ st.info("Set API key to initialize vector store")
127
 
128
  st.divider()
129
 
130
+ # Document Management
131
+ st.subheader("📁 Document Management")
132
+ if st.button("🔄 Clear Vector Store"):
133
  if st.session_state.vector_store:
134
  try:
135
  st.session_state.vector_store.clear_all()
136
+ st.success(" Vector store cleared")
137
  except Exception as e:
138
+ st.error(f"Error clearing store: {e}")
139
+
140
 
141
+ # ============================================================================
142
+ # MAIN CONTENT
143
+ # ============================================================================
144
+
145
+ # Upload Section
146
+ st.header("📤 Upload PDF Document")
147
 
148
  uploaded_file = st.file_uploader(
149
+ "Choose a PDF file",
150
  type=['pdf'],
151
+ help="PDF with text, images, and tables"
152
  )
153
 
154
  if uploaded_file is not None:
155
+ # Save uploaded file
156
  upload_path = Path(UPLOAD_FOLDER)
157
  upload_path.mkdir(exist_ok=True)
158
+
159
  file_path = upload_path / uploaded_file.name
160
  with open(file_path, 'wb') as f:
161
  f.write(uploaded_file.getbuffer())
 
162
 
163
+ st.success(f" File saved: {uploaded_file.name}")
164
+
165
+ # Parse PDF
166
+ if st.button("🔍 Parse PDF"):
167
  if not st.session_state.api_key_set:
168
+ st.error(" Please set OpenAI API key first")
169
  else:
170
  try:
171
+ with st.spinner("📄 Parsing PDF..."):
172
+ print(f"\n{'='*70}")
173
+ print(f"PARSING: {uploaded_file.name}")
174
+ print(f"{'='*70}")
175
+
176
+ # Parse PDF - returns text, images, tables
177
  parser = st.session_state.parser
178
  text, images, tables = parser.parse_pdf(str(file_path))
179
 
180
+ # Store in session state
181
  st.session_state.current_document = uploaded_file.name
182
  st.session_state.current_text = text
183
  st.session_state.current_images = images
184
  st.session_state.current_tables = tables
185
 
186
+ # Display results
187
  col1, col2, col3 = st.columns(3)
188
  with col1:
189
+ st.metric("📝 Text", f"{len(text):,} chars")
190
  with col2:
191
+ st.metric("🖼️ Images", len(images))
192
  with col3:
193
+ st.metric("📋 Tables", len(tables))
194
 
195
+ # Show image OCR details
196
  if images:
197
+ st.subheader("🖼️ Extracted Images")
198
  for idx, img in enumerate(images):
199
  ocr_text = img.get('ocr_text', '')
200
  ocr_len = len(ocr_text)
201
+
202
  if ocr_len > 0:
203
+ st.success(f" Image {idx}: {ocr_len} characters (OCR)")
204
  else:
205
+ st.warning(f"⚠️ Image {idx}: No OCR text (will use visual analysis)")
206
+
207
+ st.success("✅ PDF parsing complete!")
208
 
 
209
  except Exception as e:
210
+ st.error(f" Error parsing PDF: {e}")
211
+ print(f"Error: {e}")
212
 
 
213
 
214
+ # ============================================================================
215
+ # VISUAL IMAGE ANALYSIS & COMPONENT STORAGE
216
+ # ============================================================================
217
 
218
+ st.divider()
219
+ st.header("🖼️ Visual Analysis & Storage")
220
+
221
+ st.info("""
222
+ **How it works:**
223
+ 1. Images are sent to gpt-4o for visual analysis (not just text OCR)
224
+ 2. Text is split into chunks and each chunk is summarized
225
+ 3. Tables are analyzed individually
226
+ 4. ALL summaries are stored in the vector store for semantic search
227
+ """)
228
 
229
+ if st.button("🖼️ Analyze Images Visually & Store Components"):
230
  if not st.session_state.api_key_set:
231
+ st.error(" Please set OpenAI API key first")
232
  elif st.session_state.current_text is None:
233
+ st.error(" Please parse a PDF document first")
234
  else:
235
  try:
236
+ with st.spinner("🖼️ Analyzing images visually with gpt-4o..."):
237
+ print(f"\n{'='*70}")
238
+ print(f"VISUAL IMAGE ANALYSIS")
239
+ print(f"{'='*70}")
240
+
241
+ # Process with visual analysis
242
  visual_rag = st.session_state.visual_rag_system
243
  vector_store = st.session_state.vector_store
244
+
245
  results = visual_rag.process_and_store_document(
246
  text=st.session_state.current_text,
247
+ images=st.session_state.current_images, # Actual images sent to gpt-4o
248
  tables=st.session_state.current_tables,
249
  vector_store=vector_store,
250
  doc_id=st.session_state.current_document or "current_doc"
 
252
 
253
  st.session_state.processing_results = results
254
 
255
+ # Display results
256
+ st.success("✅ Visual analysis complete & stored!")
257
 
258
  col1, col2, col3 = st.columns(3)
259
  with col1:
260
+ st.metric("🖼️ Images Analyzed", len(results['image_visual_analyses']))
261
  with col2:
262
+ st.metric("📝 Text Chunks", len(results['text_summaries']))
263
  with col3:
264
+ st.metric("📋 Tables Analyzed", len(results['table_summaries']))
265
 
266
+ st.metric("📊 Total Stored in Vector", results['total_stored'])
267
+
268
+ # Show image visual analyses
269
+ if results['image_visual_analyses']:
270
+ st.subheader("🖼️ Visual Image Analyses (gpt-4o)")
271
+ for img_analysis in results['image_visual_analyses']:
272
+ with st.expander(f"Image {img_analysis['image_index']} - Visual Analysis"):
273
+ st.write("**Visual Analysis by gpt-4o:**")
274
+ st.write(img_analysis['visual_analysis'])
275
+
276
+ st.write("**Image Path:**")
277
+ st.code(img_analysis['image_path'])
278
+
279
+ if img_analysis['ocr_text']:
280
+ st.write("**OCR Text (backup):**")
281
+ st.text(img_analysis['ocr_text'][:500])
282
+
283
+ # Show text chunk summaries
284
+ if results['text_summaries']:
285
+ st.subheader("📝 Text Chunk Summaries")
286
+ for chunk_summary in results['text_summaries']:
287
+ with st.expander(
288
+ f"Chunk {chunk_summary['chunk_index']} "
289
+ f"({chunk_summary['chunk_length']} chars)"
290
+ ):
291
+ st.write("**Summary:**")
292
+ st.write(chunk_summary['summary'])
293
+ st.write("**Original Text (first 500 chars):**")
294
+ st.text(chunk_summary['original_text'])
295
+
296
+ # Show table analyses
297
+ if results['table_summaries']:
298
+ st.subheader("📋 Table Analyses")
299
+ for table_summary in results['table_summaries']:
300
+ with st.expander(
301
+ f"Table {table_summary['table_index']} "
302
+ f"({table_summary['table_length']} chars)"
303
+ ):
304
+ st.write("**Analysis:**")
305
+ st.write(table_summary['summary'])
306
+ st.write("**Original Content (first 500 chars):**")
307
+ st.text(table_summary['original_content'])
308
+
309
+ print(f"\n✅ Visual analysis processing complete!")
310
 
311
  except Exception as e:
312
+ st.error(f" Error during visual analysis: {e}")
313
+ print(f"Error: {e}")
314
+
315
+
316
+ # ============================================================================
317
+ # QUESTION & ANSWERING
318
+ # ============================================================================
319
 
320
  st.divider()
321
+ st.header("❓ Ask Questions About Document")
322
 
323
+ # Initialize answering system if not done
324
+ if 'answering_rag' not in st.session_state:
325
+ st.session_state.answering_rag = None
326
 
327
+ # Create answering system when API key is set
328
  if st.session_state.api_key_set and st.session_state.answering_rag is None:
329
+ from rag_system import AnsweringRAG
330
  st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
331
 
332
  question = st.text_area(
333
+ "Enter your question:",
334
  height=100,
335
+ placeholder="What does the document say about...?"
336
  )
337
 
338
+ if st.button("🔍 Search & Generate Answer"):
339
  if not st.session_state.api_key_set:
340
+ st.error(" Please set OpenAI API key first")
341
  elif st.session_state.current_text is None:
342
+ st.error(" Please parse a PDF document first")
343
  elif not question:
344
+ st.error(" Please enter a question")
345
  else:
346
  try:
347
+ with st.spinner("🔄 Searching document and analyzing..."):
348
+ print(f"\n{'='*70}")
349
+ print(f"QUESTION: {question}")
350
+ print(f"{'='*70}")
351
+
352
+ # Search vector store
353
  store = st.session_state.vector_store
354
 
355
+ # Add the current document's text to the store before searching (done on every query)
356
  doc_name = st.session_state.current_document or "current_doc"
357
  doc_data = {
358
  'text': st.session_state.current_text,
359
  'images': [],
360
  'tables': []
361
  }
 
362
  store.add_documents(doc_data, doc_name)
363
 
364
+ # Search for relevant results
365
  search_results = store.search(question, n_results=5)
366
 
367
+ print(f"\n📊 Search Results Found: {len(search_results)}")
368
+
369
+ # Analyze results and generate answer
370
  answering_rag = st.session_state.answering_rag
371
  result = answering_rag.analyze_and_answer(question, search_results)
372
 
373
+ # Display answer prominently
374
+ st.success("✅ Analysis complete!")
375
 
376
+ st.subheader("📝 Answer")
377
 
378
+ # Show confidence level
379
  col1, col2, col3 = st.columns(3)
380
  with col1:
381
+ confidence_color = {
382
+ 'high': '🟢',
383
+ 'medium': '🟡',
384
+ 'low': '🔴'
385
+ }.get(result['confidence'], '⚪')
386
+ st.metric("Confidence", f"{confidence_color} {result['confidence'].upper()}")
 
387
  with col2:
388
+ st.metric("Sources Used", result['sources_used'])
389
  with col3:
390
  if result['sources_used'] > 0:
391
+ st.metric("Avg Relevance", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
392
 
393
+ # Display the generated answer
394
  st.write(result['answer'])
395
 
396
+ # Show sources
397
+ if st.checkbox("📚 Show Source Documents"):
398
+ st.subheader("Sources Used in Answer")
399
+ for idx, source in enumerate(result['formatted_sources'], 1):
400
  relevance = source['relevance']
401
  relevance_bar = "█" * int(relevance * 10) + "░" * (10 - int(relevance * 10))
402
+
403
  with st.expander(
404
+ f"Source {idx} - {source['type'].upper()} "
405
  f"[{relevance_bar}] {relevance:.0%}"
406
  ):
407
  st.write(source['content'])
408
+
409
+ print(f"\n✅ Answer generation complete!")
410
+
411
  except Exception as e:
412
+ st.error(f" Error processing question: {e}")
413
+ print(f"Error: {e}")
414
+
415
+
416
+ # ============================================================================
417
+ # FOOTER
418
+ # ============================================================================
419
 
420
  st.divider()
421
 
422
+ col1, col2, col3 = st.columns(3)
423
+
424
+ with col1:
425
+ st.info("📖 **Text Processing**: PyPDF2 extraction with UTF-8 support")
426
+
427
+ with col2:
428
+ st.info("🖼️ **Visual Analysis**: GPT-4o vision for image understanding")
429
+
430
+ with col3:
431
+ st.info("📊 **Vector Storage**: ChromaDB with auto-persist")
432
+
433
  st.caption(
434
+ "Multimodal RAG System | "
435
+ "Visual Image Analysis | "
436
+ "Russian Language Support | "
437
+ "Individual Component Summarization"
438
  )
src/config.py CHANGED
@@ -1,31 +1,42 @@
 
 
 
1
  import os
2
  from pathlib import Path
3
 
 
4
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
5
- OPENAI_MODEL = "gpt-4o-mini"
6
- USE_CACHE = True
7
 
 
8
  CHROMA_DB_PATH = "./chroma_db"
9
  DOCSTORE_PATH = "./docstore"
10
  PROCESSED_FILES_LOG = "./processed_files.txt"
11
 
 
12
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
13
  EMBEDDING_DIM = 768
14
 
15
- MAX_CHUNK_SIZE = 500
16
- CHUNK_OVERLAP = 50
17
- TEMPERATURE = 0.3
18
- MAX_TOKENS = 500
 
19
 
 
20
  LANGUAGE = "russian"
21
 
 
22
  Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
23
  Path(DOCSTORE_PATH).mkdir(exist_ok=True)
24
 
 
25
  UPLOAD_FOLDER = "./uploaded_pdfs"
26
  Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
27
  MAX_PDF_SIZE_MB = 50
28
 
29
- BATCH_SEARCH_RESULTS = 3
30
- CACHE_RESPONSES = True
31
- SUMMARIZE_FIRST = True
 
 
1
+ """
2
+ Configuration file for Multimodal RAG LLM System
3
+ """
4
  import os
5
  from pathlib import Path
6
 
7
+ # API Configuration
8
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
9
+ OPENAI_MODEL = "gpt-4o-mini" # Cheaper model variant
10
+ USE_CACHE = True # Enable response caching
11
 
12
+ # Vector Store Configuration
13
  CHROMA_DB_PATH = "./chroma_db"
14
  DOCSTORE_PATH = "./docstore"
15
  PROCESSED_FILES_LOG = "./processed_files.txt"
16
 
17
+ # Embedding Model Configuration
18
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
19
  EMBEDDING_DIM = 768
20
 
21
+ # System Configuration
22
+ MAX_CHUNK_SIZE = 500 # Smaller chunks = fewer tokens
23
+ CHUNK_OVERLAP = 50 # Less overlap = fewer chunks
24
+ TEMPERATURE = 0.3 # Lower = more deterministic, less varied output
25
+ MAX_TOKENS = 500 # Limit response size (vs 1500)
26
 
27
+ # Language Support
28
  LANGUAGE = "russian"
29
 
30
+ # Create necessary directories
31
  Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
32
  Path(DOCSTORE_PATH).mkdir(exist_ok=True)
33
 
34
+ # PDF Upload Configuration
35
  UPLOAD_FOLDER = "./uploaded_pdfs"
36
  Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
37
  MAX_PDF_SIZE_MB = 50
38
 
39
+ # TOKEN OPTIMIZATION SETTINGS
40
+ BATCH_SEARCH_RESULTS = 3 # Return only top 3 (not 5)
41
+ CACHE_RESPONSES = True # Cache Q&A responses
42
+ SUMMARIZE_FIRST = True # Summarize PDFs once, not per query
src/pdf_parser.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import os
2
  import json
3
  import hashlib
@@ -17,21 +20,29 @@ class PDFParser:
17
  self.processed_files = self._load_processed_files()
18
  self.debug = debug
19
 
 
20
  self._configure_tesseract()
21
 
22
  if self.debug:
23
- print("PDFParser initialized")
24
 
25
  def _configure_tesseract(self):
 
26
  try:
 
 
 
 
 
27
  pytesseract.get_tesseract_version()
28
- print("Tesseract configured successfully")
29
  except Exception as e:
30
- print(f"Tesseract configuration warning: {e}")
31
 
32
  def _debug_print(self, label: str, data: any):
 
33
  if self.debug:
34
- print(f"[PDF Parser] {label}")
35
  if isinstance(data, dict):
36
  for key, val in data.items():
37
  print(f" {key}: {val}")
@@ -43,6 +54,7 @@ class PDFParser:
43
  print(f" {data}")
44
 
45
  def _load_processed_files(self) -> Dict[str, str]:
 
46
  if os.path.exists(PROCESSED_FILES_LOG):
47
  try:
48
  with open(PROCESSED_FILES_LOG, 'r') as f:
@@ -52,10 +64,12 @@ class PDFParser:
52
  return {}
53
 
54
  def _save_processed_files(self):
 
55
  with open(PROCESSED_FILES_LOG, 'w') as f:
56
  json.dump(self.processed_files, f, indent=2)
57
 
58
  def _get_file_hash(self, file_path: str) -> str:
 
59
  hash_md5 = hashlib.md5()
60
  with open(file_path, "rb") as f:
61
  for chunk in iter(lambda: f.read(4096), b""):
@@ -63,6 +77,7 @@ class PDFParser:
63
  return hash_md5.hexdigest()
64
 
65
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
 
66
  text = ""
67
  try:
68
  with open(pdf_path, 'rb') as file:
@@ -81,31 +96,36 @@ class PDFParser:
81
  return text
82
 
83
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
 
84
  images_data = []
85
  try:
86
  self._debug_print("Image Extraction Started", f"File: {pdf_path}")
87
 
88
  images = convert_from_path(pdf_path, dpi=150)
89
- self._debug_print("PDF to Images", f"Total images: {len(images)}")
90
 
91
  for idx, image in enumerate(images):
92
  self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
93
 
 
94
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
95
  image.save(image_path)
96
  self._debug_print(f"Image {idx} Saved", str(image_path))
97
 
98
- self._debug_print(f"Image {idx} OCR")
 
99
 
100
  try:
 
101
  ocr_text = pytesseract.image_to_string(image, lang='rus')
102
 
 
103
  ocr_text = ocr_text.strip()
104
 
105
  if not ocr_text or len(ocr_text) < 5:
106
- self._debug_print(f"Image {idx} OCR Result", f"EMPTY or very short ({len(ocr_text)} chars)")
107
  else:
108
- self._debug_print(f"Image {idx} OCR Result", f"Success - {len(ocr_text)} chars: {ocr_text[:150]}")
109
 
110
  except Exception as ocr_error:
111
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
@@ -124,6 +144,7 @@ class PDFParser:
124
  return images_data
125
 
126
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
 
127
  tables_data = []
128
  try:
129
  text = self._extract_text_from_pdf(pdf_path)
@@ -156,22 +177,26 @@ class PDFParser:
156
  return tables_data
157
 
158
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
 
159
  file_hash = self._get_file_hash(pdf_path)
160
  doc_id = Path(pdf_path).stem
161
 
162
- self._debug_print("PDF Parsing Started", f"File: {doc_id}")
163
 
 
164
  if doc_id in self.processed_files:
165
  if self.processed_files[doc_id] == file_hash:
166
- self._debug_print("Status", f"File {doc_id} already processed")
167
  return self._load_extracted_data(doc_id)
168
 
169
- print(f"Processing PDF: {doc_id}")
170
 
 
171
  text = self._extract_text_from_pdf(pdf_path)
172
  images = self._extract_images_from_pdf(pdf_path, doc_id)
173
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
174
 
 
175
  self._debug_print("Extraction Summary", {
176
  'text_length': len(text),
177
  'images_count': len(images),
@@ -179,14 +204,17 @@ class PDFParser:
179
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
180
  })
181
 
 
182
  self._save_extracted_data(doc_id, text, images, tables)
183
 
 
184
  self.processed_files[doc_id] = file_hash
185
  self._save_processed_files()
186
 
187
  return text, images, tables
188
 
189
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
 
190
  data = {
191
  'text': text,
192
  'images': images,
@@ -199,6 +227,7 @@ class PDFParser:
199
  self._debug_print("Data Saved", str(data_path))
200
 
201
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
 
202
  data_path = self.docstore_path / f"{doc_id}_data.json"
203
  try:
204
  with open(data_path, 'r', encoding='utf-8') as f:
@@ -208,6 +237,7 @@ class PDFParser:
208
  return "", [], []
209
 
210
  def get_all_documents(self) -> Dict:
 
211
  all_docs = {}
212
  for json_file in self.docstore_path.glob("*_data.json"):
213
  doc_id = json_file.stem.replace("_data", "")
 
1
+ """
2
+ PDF Parser Module with FIXED Russian OCR support
3
+ """
4
  import os
5
  import json
6
  import hashlib
 
20
  self.processed_files = self._load_processed_files()
21
  self.debug = debug
22
 
23
+ # Configure Tesseract for Russian + English
24
  self._configure_tesseract()
25
 
26
  if self.debug:
27
+ print("PDFParser initialized with Russian OCR support")
28
 
29
  def _configure_tesseract(self):
30
+ """Configure Tesseract with proper paths and language support"""
31
  try:
32
+ # Windows specific path
33
+ if os.name == 'nt':
34
+ pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
35
+
36
+ # Test Tesseract
37
  pytesseract.get_tesseract_version()
38
+ print("Tesseract configured successfully")
39
  except Exception as e:
40
+ print(f"⚠️ Tesseract configuration warning: {e}")
41
 
42
  def _debug_print(self, label: str, data: any):
43
+ """Print debug information"""
44
  if self.debug:
45
+ print(f"\n🔍 [PDF Parser] {label}")
46
  if isinstance(data, dict):
47
  for key, val in data.items():
48
  print(f" {key}: {val}")
 
54
  print(f" {data}")
55
 
56
  def _load_processed_files(self) -> Dict[str, str]:
57
+ """Load list of already processed files with their hashes"""
58
  if os.path.exists(PROCESSED_FILES_LOG):
59
  try:
60
  with open(PROCESSED_FILES_LOG, 'r') as f:
 
64
  return {}
65
 
66
  def _save_processed_files(self):
67
+ """Save processed files list to disk"""
68
  with open(PROCESSED_FILES_LOG, 'w') as f:
69
  json.dump(self.processed_files, f, indent=2)
70
 
71
  def _get_file_hash(self, file_path: str) -> str:
72
+ """Generate hash of file to detect changes"""
73
  hash_md5 = hashlib.md5()
74
  with open(file_path, "rb") as f:
75
  for chunk in iter(lambda: f.read(4096), b""):
 
77
  return hash_md5.hexdigest()
78
 
79
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
80
+ """Extract text from PDF using PyPDF2"""
81
  text = ""
82
  try:
83
  with open(pdf_path, 'rb') as file:
 
96
  return text
97
 
98
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
99
+ """Extract images from PDF pages with Russian OCR support"""
100
  images_data = []
101
  try:
102
  self._debug_print("Image Extraction Started", f"File: {pdf_path}")
103
 
104
  images = convert_from_path(pdf_path, dpi=150)
105
+ self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
106
 
107
  for idx, image in enumerate(images):
108
  self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
109
 
110
+ # Save image
111
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
112
  image.save(image_path)
113
  self._debug_print(f"Image {idx} Saved", str(image_path))
114
 
115
+ # Extract text using OCR with Russian support
116
+ self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
117
 
118
  try:
119
+ # NOTE(review): the call below passes lang='rus' only; use lang='rus+eng' to also OCR English text
120
  ocr_text = pytesseract.image_to_string(image, lang='rus')
121
 
122
+ # Clean up text
123
  ocr_text = ocr_text.strip()
124
 
125
  if not ocr_text or len(ocr_text) < 5:
126
+ self._debug_print(f"Image {idx} OCR Result", f"⚠️ EMPTY or very short ({len(ocr_text)} chars)")
127
  else:
128
+ self._debug_print(f"Image {idx} OCR Result", f"Success - {len(ocr_text)} chars: {ocr_text[:150]}")
129
 
130
  except Exception as ocr_error:
131
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
 
144
  return images_data
145
 
146
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
147
+ """Extract table content from PDF"""
148
  tables_data = []
149
  try:
150
  text = self._extract_text_from_pdf(pdf_path)
 
177
  return tables_data
178
 
179
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
180
+ """Parse PDF and extract text, images, and tables with debug output"""
181
  file_hash = self._get_file_hash(pdf_path)
182
  doc_id = Path(pdf_path).stem
183
 
184
+ self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
185
 
186
+ # Check if file was already processed
187
  if doc_id in self.processed_files:
188
  if self.processed_files[doc_id] == file_hash:
189
+ self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
190
  return self._load_extracted_data(doc_id)
191
 
192
+ print(f"\n📄 Processing PDF: {doc_id}")
193
 
194
+ # Extract content
195
  text = self._extract_text_from_pdf(pdf_path)
196
  images = self._extract_images_from_pdf(pdf_path, doc_id)
197
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
198
 
199
+ # Summary
200
  self._debug_print("Extraction Summary", {
201
  'text_length': len(text),
202
  'images_count': len(images),
 
204
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
205
  })
206
 
207
+ # Save extracted data
208
  self._save_extracted_data(doc_id, text, images, tables)
209
 
210
+ # Update processed files log
211
  self.processed_files[doc_id] = file_hash
212
  self._save_processed_files()
213
 
214
  return text, images, tables
215
 
216
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
217
+ """Save extracted data to docstore"""
218
  data = {
219
  'text': text,
220
  'images': images,
 
227
  self._debug_print("Data Saved", str(data_path))
228
 
229
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
230
+ """Load previously extracted data from docstore"""
231
  data_path = self.docstore_path / f"{doc_id}_data.json"
232
  try:
233
  with open(data_path, 'r', encoding='utf-8') as f:
 
237
  return "", [], []
238
 
239
  def get_all_documents(self) -> Dict:
240
+ """Load all processed documents from docstore"""
241
  all_docs = {}
242
  for json_file in self.docstore_path.glob("*_data.json"):
243
  doc_id = json_file.stem.replace("_data", "")
src/rag_system.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  from typing import List, Dict
2
  from langchain_openai import ChatOpenAI
3
  from langchain_core.messages import HumanMessage, SystemMessage
@@ -11,14 +16,21 @@ from config import (
11
 
12
 
13
  class VisualMultimodalRAG:
 
 
 
 
 
 
 
14
 
15
  def __init__(self, api_key: str = None, debug: bool = True):
16
  api_key = api_key or OPENAI_API_KEY
17
  self.debug = debug
18
 
19
-
20
  self.llm = ChatOpenAI(
21
- model_name="gpt-4o-mini",
22
  api_key=api_key,
23
  temperature=TEMPERATURE,
24
  max_tokens=MAX_TOKENS,
@@ -28,11 +40,12 @@ class VisualMultimodalRAG:
28
  self.visual_summaries_log = []
29
 
30
  if self.debug:
31
- print("VisualMultimodalRAG initialized")
32
 
33
  def _debug_print(self, label: str, data: any):
 
34
  if self.debug:
35
- print(f"DEBUG [{label}]:")
36
  if isinstance(data, (list, dict)):
37
  print(f" Type: {type(data).__name__}")
38
  print(f" Content: {str(data)[:300]}...")
@@ -40,6 +53,7 @@ class VisualMultimodalRAG:
40
  print(f" {data}")
41
 
42
  def _image_to_base64(self, image_path: str) -> str:
 
43
  try:
44
  with open(image_path, 'rb') as image_file:
45
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
@@ -49,14 +63,28 @@ class VisualMultimodalRAG:
49
  return None
50
 
51
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
52
  if not os.path.exists(image_path):
53
  return f"[Image {image_idx}: File not found - {image_path}]"
54
 
55
  try:
 
56
  image_base64 = self._image_to_base64(image_path)
57
  if not image_base64:
58
  return f"[Image {image_idx}: Could not convert to base64]"
59
 
 
60
  file_ext = Path(image_path).suffix.lower()
61
  media_type_map = {
62
  '.jpg': 'image/jpeg',
@@ -67,8 +95,9 @@ class VisualMultimodalRAG:
67
  }
68
  media_type = media_type_map.get(file_ext, 'image/png')
69
 
70
- print(f"Analyzing image {image_idx}...")
71
 
 
72
  message = HumanMessage(
73
  content=[
74
  {
@@ -79,44 +108,52 @@ class VisualMultimodalRAG:
79
  },
80
  {
81
  "type": "text",
82
- "text": f"""You are assistant for analyzing and aggregating information. Analyze this image.
83
- Provide a visual analysis that includes:
84
- 1. Main objects and element
85
- 2. Data/Content - Any numbers, text, charts, graphs
86
- 3. What this image is showing or representing
87
- 4. Important patterns, trends, or information
88
- 5. How image relates to document content
89
- Be brief and meaningful. Focus on visual information that cannot be extracted from text. Response on {self.language}.
 
 
 
90
  Analysis:"""
91
  }
92
  ],
93
  )
94
 
 
95
  response = self.llm.invoke([message])
96
  analysis = response.content.strip()
97
 
98
  if self.debug:
99
  self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
100
 
101
- print(f"Image {image_idx} analyzed successfully")
102
  return analysis
103
 
104
  except Exception as e:
105
  error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
106
- print(f"Error analyzing image {image_idx}: {e}")
107
  return error_msg
108
 
109
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
110
-
 
 
 
111
  visual_analyses = []
112
 
113
  for idx, image in enumerate(images):
114
  image_path = image.get('path', '')
115
 
116
  if not image_path:
117
- print(f"Image {idx}: No path provided")
118
  continue
119
 
 
120
  visual_analysis = self.analyze_image_visually(image_path, idx)
121
 
122
  visual_analyses.append({
@@ -124,12 +161,15 @@ Analysis:"""
124
  'image_index': idx,
125
  'image_path': image_path,
126
  'visual_analysis': visual_analysis,
127
- 'ocr_text': image.get('ocr_text', '')
128
  })
129
 
130
  return visual_analyses
131
 
132
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
 
 
 
133
  chunks = []
134
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
135
 
@@ -141,12 +181,12 @@ Analysis:"""
141
 
142
  try:
143
  prompt = f"""Summarize this text chunk in {self.language}.
144
- Be brief and meaningful. Extract key points, facts, and main ideas.
145
 
146
  Text Chunk:
147
  {chunk}
148
 
149
- Summary:"""
150
 
151
  message = HumanMessage(content=prompt)
152
  response = self.llm.invoke([message])
@@ -169,6 +209,9 @@ Summary:"""
169
  return chunks
170
 
171
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
 
 
 
172
  summaries = []
173
 
174
  for idx, table in enumerate(tables):
@@ -179,12 +222,12 @@ Summary:"""
179
 
180
  try:
181
  prompt = f"""Analyze and summarize this table/structured data in {self.language}.
182
- Extract key insights, row/column meanings, and important figures. Be brief and meaningful.
183
 
184
  Table Content:
185
  {table_content}
186
 
187
- Summary:"""
188
 
189
  message = HumanMessage(content=prompt)
190
  response = self.llm.invoke([message])
@@ -214,9 +257,13 @@ Summary:"""
214
  vector_store,
215
  doc_id: str
216
  ) -> Dict:
217
-
 
 
 
 
218
  print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
219
-
220
 
221
  results = {
222
  'doc_id': doc_id,
@@ -226,13 +273,14 @@ Summary:"""
226
  'total_stored': 0
227
  }
228
 
229
-
230
- print(f"VISUAL IMAGE ANALYSIS ({len(images)} total)")
231
-
232
 
233
  image_analyses = self.analyze_images_visually(images)
234
  results['image_visual_analyses'] = image_analyses
235
 
 
236
  image_docs = {
237
  'text': ' | '.join([
238
  f"Image {a['image_index']}: {a['visual_analysis']}"
@@ -243,7 +291,7 @@ Summary:"""
243
  }
244
 
245
  for analysis in image_analyses:
246
- print(f" Image {analysis['image_index']} (visual analysis)")
247
  print(f" Path: {analysis['image_path']}")
248
  print(f" Analysis: {analysis['visual_analysis'][:100]}...")
249
 
@@ -254,11 +302,13 @@ Summary:"""
254
  f"{doc_id}_images_visual"
255
  )
256
  results['total_stored'] += len(image_analyses)
257
- print(f" Stored {len(image_analyses)} image visual analyses")
258
  except Exception as e:
259
- print(f" Error storing image analyses: {e}")
260
 
261
- print(f" TEXT CHUNK SUMMARIZATION")
 
 
262
 
263
  text_summaries = self.summarize_text_chunks(text)
264
  results['text_summaries'] = text_summaries
@@ -271,7 +321,7 @@ Summary:"""
271
  }
272
 
273
  for summary in text_summaries:
274
- print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
275
 
276
  if text_summaries:
277
  try:
@@ -280,11 +330,13 @@ Summary:"""
280
  f"{doc_id}_text_chunks"
281
  )
282
  results['total_stored'] += len(text_summaries)
283
- print(f" Stored {len(text_summaries)} text chunk summaries")
284
  except Exception as e:
285
- print(f" Error storing text summaries: {e}")
286
 
287
- print(f" TABLE SUMMARIZATION ({len(tables)} total)")
 
 
288
 
289
  table_summaries = self.summarize_tables(tables)
290
  results['table_summaries'] = table_summaries
@@ -297,7 +349,7 @@ Summary:"""
297
  }
298
 
299
  for summary in table_summaries:
300
- print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
301
 
302
  if table_summaries:
303
  try:
@@ -306,15 +358,19 @@ Summary:"""
306
  f"{doc_id}_tables"
307
  )
308
  results['total_stored'] += len(table_summaries)
309
- print(f" Stored {len(table_summaries)} table summaries")
310
  except Exception as e:
311
- print(f" Error storing table summaries: {e}")
312
 
313
- print(f" STORAGE SUMMARY")
 
 
 
314
  print(f" Images analyzed visually & stored: {len(image_analyses)}")
315
  print(f" Text chunks summarized & stored: {len(text_summaries)}")
316
  print(f" Tables summarized & stored: {len(table_summaries)}")
317
  print(f" Total items stored in vector: {results['total_stored']}")
 
318
 
319
  self.visual_summaries_log.append(results)
320
  return results
@@ -335,13 +391,19 @@ Summary:"""
335
 
336
 
337
  class AnsweringRAG:
 
 
 
 
 
 
338
 
339
  def __init__(self, api_key: str = None, debug: bool = True):
340
  api_key = api_key or OPENAI_API_KEY
341
  self.debug = debug
342
 
343
  self.llm = ChatOpenAI(
344
- model_name="gpt-4o-mini",
345
  api_key=api_key,
346
  temperature=TEMPERATURE,
347
  max_tokens=MAX_TOKENS,
@@ -351,11 +413,12 @@ class AnsweringRAG:
351
  self.answer_log = []
352
 
353
  if self.debug:
354
- print("AnsweringRAG initialized ")
355
 
356
  def _debug_print(self, label: str, data: any):
 
357
  if self.debug:
358
- print(f" DEBUG [{label}]:")
359
  if isinstance(data, (list, dict)):
360
  print(f" Type: {type(data).__name__}")
361
  print(f" Content: {str(data)[:300]}...")
@@ -367,17 +430,35 @@ class AnsweringRAG:
367
  question: str,
368
  search_results: List[Dict]
369
  ) -> Dict:
370
-
 
 
 
 
 
 
 
 
 
 
 
371
 
 
372
  print(f"ANALYZING QUESTION & GENERATING ANSWER")
 
373
 
374
- print(f"Question: {question}")
375
- print(f"Search Results Found: {len(search_results)}")
376
 
 
377
  if not search_results:
378
- print(f"No search results found!")
379
- answer = f"""No relevant information in the document to answer your question: "{question}"
380
- """
 
 
 
 
381
 
382
  result = {
383
  'question': question,
@@ -389,6 +470,7 @@ class AnsweringRAG:
389
  self.answer_log.append(result)
390
  return result
391
 
 
392
  context_parts = []
393
  for idx, result in enumerate(search_results, 1):
394
  content = result.get('content', '')
@@ -405,6 +487,7 @@ class AnsweringRAG:
405
 
406
  self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
407
 
 
408
  analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
409
 
410
  USER QUESTION:
@@ -420,20 +503,23 @@ INSTRUCTIONS:
420
  4. If the content doesn't fully answer the question, explain what information is available
421
  5. Be specific and cite the content when relevant
422
  6. Structure your answer clearly with key points
 
423
  ANSWER:"""
424
 
425
- print(f"Analyzing search results...")
426
  print(f" Context size: {len(full_context)} characters")
427
  print(f" Sources: {len(search_results)}")
428
 
429
  try:
 
430
  message = HumanMessage(content=analysis_prompt)
431
  response = self.llm.invoke([message])
432
  answer = response.content.strip()
433
 
 
434
  confidence = self._estimate_confidence(len(search_results), answer)
435
 
436
- print(f" Answer generated successfully")
437
  print(f" Confidence: {confidence}")
438
  print(f" Answer length: {len(answer)} characters")
439
 
@@ -449,7 +535,7 @@ ANSWER:"""
449
  return result
450
 
451
  except Exception as e:
452
- print(f" Error generating answer: {e}")
453
  answer = f"I encountered an error while analyzing the search results. Please try again."
454
 
455
  result = {
@@ -465,14 +551,18 @@ ANSWER:"""
465
  return result
466
 
467
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
 
468
  answer_length = len(answer)
469
 
 
470
  if sources_count >= 3 and answer_length > 500:
471
  return "high"
472
 
 
473
  elif sources_count >= 2 and answer_length > 200:
474
  return "medium"
475
 
 
476
  else:
477
  return "low"
478
 
@@ -481,9 +571,14 @@ ANSWER:"""
481
  question: str,
482
  search_results: List[Dict]
483
  ) -> Dict:
 
 
 
 
484
 
485
  result = self.analyze_and_answer(question, search_results)
486
 
 
487
  formatted_sources = []
488
  for idx, source in enumerate(result['search_results'], 1):
489
  formatted_sources.append({
@@ -497,18 +592,25 @@ ANSWER:"""
497
  return result
498
 
499
  def get_answer_log(self) -> List[Dict]:
 
500
  return self.answer_log
501
 
502
  def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
 
503
 
 
504
  print(f"ANSWER TO: {result['question']}")
 
505
 
506
- print(f"ANSWER (Confidence: {result['confidence'].upper()}):")
 
507
  print(result['answer'])
 
508
 
509
  if result.get('formatted_sources'):
510
- print(f"SOURCES USED ({len(result['formatted_sources'])} total):")
511
  for source in result['formatted_sources']:
512
  print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
513
  print(f"{source['content'][:max_source_length]}...")
514
 
 
 
1
+ """
2
+ Enhanced RAG System - Visual Image Analysis
3
+ Sends base64 images directly to gpt-4o-mini (vision-capable) for visual analysis (not just OCR)
4
+ Then stores results in vector store
5
+ """
6
  from typing import List, Dict
7
  from langchain_openai import ChatOpenAI
8
  from langchain_core.messages import HumanMessage, SystemMessage
 
16
 
17
 
18
  class VisualMultimodalRAG:
19
+ """
20
+ RAG system that:
21
+ 1. Sends images as base64 to GPT-4o for visual analysis
22
+ 2. Gets detailed visual descriptions and insights
23
+ 3. Stores visual analysis in vector store
24
+ 4. Enables image-based semantic search
25
+ """
26
 
27
  def __init__(self, api_key: str = None, debug: bool = True):
28
  api_key = api_key or OPENAI_API_KEY
29
  self.debug = debug
30
 
31
+ # Vision-capable chat model; model_name below selects gpt-4o-mini
32
  self.llm = ChatOpenAI(
33
+ model_name="gpt-4o-mini", # CRITICAL: gpt-4o has vision
34
  api_key=api_key,
35
  temperature=TEMPERATURE,
36
  max_tokens=MAX_TOKENS,
 
40
  self.visual_summaries_log = []
41
 
42
  if self.debug:
43
+ print("VisualMultimodalRAG initialized with gpt-4o (vision model)")
44
 
45
  def _debug_print(self, label: str, data: any):
46
+ """Print debug information"""
47
  if self.debug:
48
+ print(f"\n🔍 DEBUG [{label}]:")
49
  if isinstance(data, (list, dict)):
50
  print(f" Type: {type(data).__name__}")
51
  print(f" Content: {str(data)[:300]}...")
 
53
  print(f" {data}")
54
 
55
  def _image_to_base64(self, image_path: str) -> str:
56
+ """Convert image file to base64 string"""
57
  try:
58
  with open(image_path, 'rb') as image_file:
59
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
 
63
  return None
64
 
65
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
66
+ """
67
+ Send actual image (base64) to gpt-4o for visual analysis
68
+ Returns detailed visual analysis/description
69
+
70
+ gpt-4o can see:
71
+ - Charts, graphs, diagrams
72
+ - Tables and structured data
73
+ - Photos and drawings
74
+ - Handwritten text
75
+ - Screenshots
76
+ - Any visual content
77
+ """
78
  if not os.path.exists(image_path):
79
  return f"[Image {image_idx}: File not found - {image_path}]"
80
 
81
  try:
82
+ # Convert image to base64
83
  image_base64 = self._image_to_base64(image_path)
84
  if not image_base64:
85
  return f"[Image {image_idx}: Could not convert to base64]"
86
 
87
+ # Determine image type
88
  file_ext = Path(image_path).suffix.lower()
89
  media_type_map = {
90
  '.jpg': 'image/jpeg',
 
95
  }
96
  media_type = media_type_map.get(file_ext, 'image/png')
97
 
98
+ print(f"🔍 Analyzing image {image_idx} visually (as {media_type})...")
99
 
100
+ # Create message with image
101
  message = HumanMessage(
102
  content=[
103
  {
 
108
  },
109
  {
110
  "type": "text",
111
+ "text": f"""Analyze this image in detail in {self.language}.
112
+
113
+ Provide a comprehensive visual analysis including:
114
+ 1. **What you see** - Main objects, elements, structure
115
+ 2. **Data/Content** - Any numbers, text, charts, graphs
116
+ 3. **Purpose** - What this image is showing or representing
117
+ 4. **Key insights** - Important patterns, trends, or information
118
+ 5. **Connections** - How this relates to document content
119
+
120
+ Be specific and detailed. Focus on visual information that cannot be extracted from text alone.
121
+
122
  Analysis:"""
123
  }
124
  ],
125
  )
126
 
127
+ # Call gpt-4o with vision
128
  response = self.llm.invoke([message])
129
  analysis = response.content.strip()
130
 
131
  if self.debug:
132
  self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
133
 
134
+ print(f"Image {image_idx} analyzed successfully")
135
  return analysis
136
 
137
  except Exception as e:
138
  error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
139
+ print(f"Error analyzing image {image_idx}: {e}")
140
  return error_msg
141
 
142
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
143
+ """
144
+ Analyze each image visually using gpt-4o vision
145
+ Returns list of {image_index, visual_analysis, type}
146
+ """
147
  visual_analyses = []
148
 
149
  for idx, image in enumerate(images):
150
  image_path = image.get('path', '')
151
 
152
  if not image_path:
153
+ print(f"⚠️ Image {idx}: No path provided")
154
  continue
155
 
156
+ # Analyze image visually (not just OCR)
157
  visual_analysis = self.analyze_image_visually(image_path, idx)
158
 
159
  visual_analyses.append({
 
161
  'image_index': idx,
162
  'image_path': image_path,
163
  'visual_analysis': visual_analysis,
164
+ 'ocr_text': image.get('ocr_text', '') # Keep OCR as backup
165
  })
166
 
167
  return visual_analyses
168
 
169
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
170
+ """
171
+ Chunk text and summarize each chunk individually
172
+ """
173
  chunks = []
174
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
175
 
 
181
 
182
  try:
183
  prompt = f"""Summarize this text chunk in {self.language}.
184
+ Keep it concise. Extract key points, facts, and main ideas.
185
 
186
  Text Chunk:
187
  {chunk}
188
 
189
+ Summary (2-3 sentences maximum):"""
190
 
191
  message = HumanMessage(content=prompt)
192
  response = self.llm.invoke([message])
 
209
  return chunks
210
 
211
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
212
+ """
213
+ Summarize each table individually
214
+ """
215
  summaries = []
216
 
217
  for idx, table in enumerate(tables):
 
222
 
223
  try:
224
  prompt = f"""Analyze and summarize this table/structured data in {self.language}.
225
+ Extract key insights, row/column meanings, and important figures.
226
 
227
  Table Content:
228
  {table_content}
229
 
230
+ Summary (2-3 sentences maximum):"""
231
 
232
  message = HumanMessage(content=prompt)
233
  response = self.llm.invoke([message])
 
257
  vector_store,
258
  doc_id: str
259
  ) -> Dict:
260
+ """
261
+ Main function: Analyze all components visually and store in vector store
262
+ Images are analyzed using gpt-4o vision (not just OCR)
263
+ """
264
+ print(f"\n{'='*70}")
265
  print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
266
+ print(f"{'='*70}")
267
 
268
  results = {
269
  'doc_id': doc_id,
 
273
  'total_stored': 0
274
  }
275
 
276
+ # 1. Analyze images VISUALLY using gpt-4o
277
+ print(f"\n🖼️ VISUAL IMAGE ANALYSIS (gpt-4o vision) ({len(images)} total)")
278
+ print(f"{'─'*70}")
279
 
280
  image_analyses = self.analyze_images_visually(images)
281
  results['image_visual_analyses'] = image_analyses
282
 
283
+ # Store each image analysis in vector store
284
  image_docs = {
285
  'text': ' | '.join([
286
  f"Image {a['image_index']}: {a['visual_analysis']}"
 
291
  }
292
 
293
  for analysis in image_analyses:
294
+ print(f"Image {analysis['image_index']} (visual analysis)")
295
  print(f" Path: {analysis['image_path']}")
296
  print(f" Analysis: {analysis['visual_analysis'][:100]}...")
297
 
 
302
  f"{doc_id}_images_visual"
303
  )
304
  results['total_stored'] += len(image_analyses)
305
+ print(f" Stored {len(image_analyses)} image visual analyses")
306
  except Exception as e:
307
+ print(f" Error storing image analyses: {e}")
308
 
309
+ # 2. Summarize and store text chunks
310
+ print(f"\n📝 TEXT CHUNK SUMMARIZATION")
311
+ print(f"{'─'*70}")
312
 
313
  text_summaries = self.summarize_text_chunks(text)
314
  results['text_summaries'] = text_summaries
 
321
  }
322
 
323
  for summary in text_summaries:
324
+ print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
325
 
326
  if text_summaries:
327
  try:
 
330
  f"{doc_id}_text_chunks"
331
  )
332
  results['total_stored'] += len(text_summaries)
333
+ print(f" Stored {len(text_summaries)} text chunk summaries")
334
  except Exception as e:
335
+ print(f" Error storing text summaries: {e}")
336
 
337
+ # 3. Summarize and store tables
338
+ print(f"\n📋 TABLE SUMMARIZATION ({len(tables)} total)")
339
+ print(f"{'─'*70}")
340
 
341
  table_summaries = self.summarize_tables(tables)
342
  results['table_summaries'] = table_summaries
 
349
  }
350
 
351
  for summary in table_summaries:
352
+ print(f"Table {summary['table_index']}: {summary['summary'][:50]}...")
353
 
354
  if table_summaries:
355
  try:
 
358
  f"{doc_id}_tables"
359
  )
360
  results['total_stored'] += len(table_summaries)
361
+ print(f" Stored {len(table_summaries)} table summaries")
362
  except Exception as e:
363
+ print(f" Error storing table summaries: {e}")
364
 
365
+ # 4. Summary statistics
366
+ print(f"\n{'='*70}")
367
+ print(f"📊 STORAGE SUMMARY")
368
+ print(f"{'='*70}")
369
  print(f" Images analyzed visually & stored: {len(image_analyses)}")
370
  print(f" Text chunks summarized & stored: {len(text_summaries)}")
371
  print(f" Tables summarized & stored: {len(table_summaries)}")
372
  print(f" Total items stored in vector: {results['total_stored']}")
373
+ print(f"{'='*70}")
374
 
375
  self.visual_summaries_log.append(results)
376
  return results
 
391
 
392
 
393
  class AnsweringRAG:
394
+ """
395
+ RAG system that:
396
+ 1. Searches vector store for relevant content
397
+ 2. ANALYZES search results
398
+ 3. Generates intelligent answers based on context
399
+ """
400
 
401
  def __init__(self, api_key: str = None, debug: bool = True):
402
  api_key = api_key or OPENAI_API_KEY
403
  self.debug = debug
404
 
405
  self.llm = ChatOpenAI(
406
+ model_name="gpt-4o-mini", # Use gpt-4o for better understanding
407
  api_key=api_key,
408
  temperature=TEMPERATURE,
409
  max_tokens=MAX_TOKENS,
 
413
  self.answer_log = []
414
 
415
  if self.debug:
416
+ print("AnsweringRAG initialized with answer generation")
417
 
418
  def _debug_print(self, label: str, data: any):
419
+ """Print debug information"""
420
  if self.debug:
421
+ print(f"\n🔍 DEBUG [{label}]:")
422
  if isinstance(data, (list, dict)):
423
  print(f" Type: {type(data).__name__}")
424
  print(f" Content: {str(data)[:300]}...")
 
430
  question: str,
431
  search_results: List[Dict]
432
  ) -> Dict:
433
+ """
434
+ Analyze search results and generate intelligent answer
435
+
436
+ Returns:
437
+ {
438
+ 'question': user question,
439
+ 'answer': detailed answer,
440
+ 'sources_used': number of sources,
441
+ 'confidence': low/medium/high,
442
+ 'search_results': original search results
443
+ }
444
+ """
445
 
446
+ print(f"\n{'='*70}")
447
  print(f"ANALYZING QUESTION & GENERATING ANSWER")
448
+ print(f"{'='*70}")
449
 
450
+ print(f"\n❓ Question: {question}")
451
+ print(f"📊 Search Results Found: {len(search_results)}")
452
 
453
+ # Check if we have search results
454
  if not search_results:
455
+ print(f"⚠️ No search results found!")
456
+ answer = f"""I could not find relevant information in the document to answer your question: "{question}"
457
+
458
+ Try:
459
+ - Using different keywords
460
+ - Breaking the question into smaller parts
461
+ - Asking about other topics in the document"""
462
 
463
  result = {
464
  'question': question,
 
470
  self.answer_log.append(result)
471
  return result
472
 
473
+ # Build context from search results
474
  context_parts = []
475
  for idx, result in enumerate(search_results, 1):
476
  content = result.get('content', '')
 
487
 
488
  self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
489
 
490
+ # Build prompt to analyze results and answer question
491
  analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
492
 
493
  USER QUESTION:
 
503
  4. If the content doesn't fully answer the question, explain what information is available
504
  5. Be specific and cite the content when relevant
505
  6. Structure your answer clearly with key points
506
+
507
  ANSWER:"""
508
 
509
+ print(f"\n🔍 Analyzing search results...")
510
  print(f" Context size: {len(full_context)} characters")
511
  print(f" Sources: {len(search_results)}")
512
 
513
  try:
514
+ # Call LLM to analyze and answer
515
  message = HumanMessage(content=analysis_prompt)
516
  response = self.llm.invoke([message])
517
  answer = response.content.strip()
518
 
519
+ # Determine confidence level
520
  confidence = self._estimate_confidence(len(search_results), answer)
521
 
522
+ print(f" Answer generated successfully")
523
  print(f" Confidence: {confidence}")
524
  print(f" Answer length: {len(answer)} characters")
525
 
 
535
  return result
536
 
537
  except Exception as e:
538
+ print(f" Error generating answer: {e}")
539
  answer = f"I encountered an error while analyzing the search results. Please try again."
540
 
541
  result = {
 
551
  return result
552
 
553
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
554
+ """Estimate confidence level of answer"""
555
  answer_length = len(answer)
556
 
557
+ # High confidence: multiple sources, substantial answer
558
  if sources_count >= 3 and answer_length > 500:
559
  return "high"
560
 
561
+ # Medium confidence: some sources, decent answer
562
  elif sources_count >= 2 and answer_length > 200:
563
  return "medium"
564
 
565
+ # Low confidence: few sources or short answer
566
  else:
567
  return "low"
568
 
 
571
  question: str,
572
  search_results: List[Dict]
573
  ) -> Dict:
574
+ """
575
+ Get answer AND properly formatted sources
576
+ Returns both answer and formatted source citations
577
+ """
578
 
579
  result = self.analyze_and_answer(question, search_results)
580
 
581
+ # Format sources for display
582
  formatted_sources = []
583
  for idx, source in enumerate(result['search_results'], 1):
584
  formatted_sources.append({
 
592
  return result
593
 
594
  def get_answer_log(self) -> List[Dict]:
      """Return the answer records accumulated on this instance.

      The list object held in ``self.answer_log`` is returned as-is (not a
      copy), so changes made by the caller are visible to this object.
      """
      return self.answer_log
597
 
598
  def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
599
+ """Pretty print answer with sources"""
600
 
601
+ print(f"\n{'='*70}")
602
  print(f"ANSWER TO: {result['question']}")
603
+ print(f"{'='*70}")
604
 
605
+ print(f"\n📝 ANSWER (Confidence: {result['confidence'].upper()}):")
606
+ print(f"{'-'*70}")
607
  print(result['answer'])
608
+ print(f"{'-'*70}")
609
 
610
  if result.get('formatted_sources'):
611
+ print(f"\n📚 SOURCES USED ({len(result['formatted_sources'])} total):")
612
  for source in result['formatted_sources']:
613
  print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
614
  print(f"{source['content'][:max_source_length]}...")
615
 
616
+ print(f"\n{'='*70}")
src/vector_store.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import os
2
  import json
3
  from typing import List, Dict
@@ -8,12 +12,14 @@ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
8
 
9
 
10
  class CLIPEmbedder:
 
11
  def __init__(self, model_name: str = EMBEDDING_MODEL):
12
- print(f" Loading embedding model: {model_name}")
13
  self.model = SentenceTransformer(model_name)
14
- print(f" Model loaded successfully")
15
 
16
  def embed(self, text: str) -> List[float]:
 
17
  try:
18
  embedding = self.model.encode(text, convert_to_numpy=False)
19
  return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
@@ -22,6 +28,7 @@ class CLIPEmbedder:
22
  return [0.0] * EMBEDDING_DIM
23
 
24
  def embed_batch(self, texts: List[str]) -> List[List[float]]:
 
25
  try:
26
  embeddings = self.model.encode(texts, convert_to_numpy=False)
27
  return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
@@ -31,30 +38,34 @@ class CLIPEmbedder:
31
 
32
 
33
  class VectorStore:
 
34
  def __init__(self):
35
  self.persist_directory = CHROMA_DB_PATH
36
  self.embedder = CLIPEmbedder()
37
 
38
- print(f" Initializing ChromaDB at: {self.persist_directory}")
39
 
 
40
  try:
41
  self.client = chromadb.PersistentClient(
42
  path=self.persist_directory
43
  )
44
- print(f" ChromaDB initialized")
45
  except Exception as e:
46
- print(f" Error initializing ChromaDB: {e}")
 
47
  self.client = chromadb.PersistentClient(
48
  path=self.persist_directory
49
  )
50
 
 
51
  try:
52
  self.collection = self.client.get_or_create_collection(
53
  name="multimodal_rag",
54
  metadata={"hnsw:space": "cosine"}
55
  )
56
  count = self.collection.count()
57
- print(f" Collection loaded: {count} items in store")
58
  except Exception as e:
59
  print(f"Error with collection: {e}")
60
  self.collection = self.client.get_or_create_collection(
@@ -62,12 +73,14 @@ class VectorStore:
62
  )
63
 
64
  def add_documents(self, documents: List[Dict], doc_id: str):
 
65
  texts = []
66
  metadatas = []
67
  ids = []
68
 
69
- print(f" Adding documents for: {doc_id}")
70
 
 
71
  if 'text' in documents and documents['text']:
72
  chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
73
  for idx, chunk in enumerate(chunks):
@@ -78,8 +91,9 @@ class VectorStore:
78
  'chunk_idx': str(idx)
79
  })
80
  ids.append(f"{doc_id}_text_{idx}")
81
- print(f" Text: {len(chunks)} chunks")
82
 
 
83
  if 'images' in documents:
84
  image_count = 0
85
  for idx, image_data in enumerate(documents['images']):
@@ -94,8 +108,9 @@ class VectorStore:
94
  ids.append(f"{doc_id}_image_{idx}")
95
  image_count += 1
96
  if image_count > 0:
97
- print(f" Images: {image_count} with OCR text")
98
 
 
99
  if 'tables' in documents:
100
  table_count = 0
101
  for idx, table_data in enumerate(documents['tables']):
@@ -109,12 +124,14 @@ class VectorStore:
109
  ids.append(f"{doc_id}_table_{idx}")
110
  table_count += 1
111
  if table_count > 0:
112
- print(f" Tables: {table_count}")
113
 
114
  if texts:
115
- print(f" Generating {len(texts)} embeddings...")
 
116
  embeddings = self.embedder.embed_batch(texts)
117
 
 
118
  try:
119
  self.collection.add(
120
  ids=ids,
@@ -122,10 +139,11 @@ class VectorStore:
122
  embeddings=embeddings,
123
  metadatas=metadatas
124
  )
125
- print(f" Successfully added {len(texts)} items to vector store")
126
- print(f" Data persisted automatically to: {self.persist_directory}")
 
127
  except Exception as e:
128
- print(f" Error adding to collection: {e}")
129
 
130
  def search(self, query: str, n_results: int = 5) -> List[Dict]:
131
  """Search vector store for similar documents"""
@@ -137,6 +155,7 @@ class VectorStore:
137
  n_results=n_results
138
  )
139
 
 
140
  formatted_results = []
141
  if results['documents']:
142
  for i, doc in enumerate(results['documents'][0]):
@@ -156,6 +175,7 @@ class VectorStore:
156
  return []
157
 
158
  def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
 
159
  chunks = []
160
  start = 0
161
  while start < len(text):
@@ -165,6 +185,7 @@ class VectorStore:
165
  return chunks
166
 
167
  def get_collection_info(self) -> Dict:
 
168
  try:
169
  count = self.collection.count()
170
  return {
@@ -178,25 +199,35 @@ class VectorStore:
178
  return {'status': 'error', 'message': str(e)}
179
 
180
  def delete_by_doc_id(self, doc_id: str):
 
181
  try:
 
182
  results = self.collection.get(where={'doc_id': doc_id})
183
  if results['ids']:
184
  self.collection.delete(ids=results['ids'])
185
- print(f" Deleted {len(results['ids'])} documents for {doc_id}")
 
 
186
  except Exception as e:
187
  print(f"Error deleting documents: {e}")
188
 
189
  def persist(self):
190
-
191
- print(" Vector store is using auto-persist")
 
 
 
 
192
 
193
  def clear_all(self):
 
194
  try:
 
195
  self.client.delete_collection(name="multimodal_rag")
196
  self.collection = self.client.get_or_create_collection(
197
  name="multimodal_rag",
198
  metadata={"hnsw:space": "cosine"}
199
  )
200
- print(" Collection cleared and reset")
201
  except Exception as e:
202
  print(f"Error clearing collection: {e}")
 
1
+ """
2
+ Vector Store and Embeddings Module using ChromaDB with sentence-transformers
3
+ UPDATED for ChromaDB v0.4.22+ (auto-persist, no manual persist needed)
4
+ """
5
  import os
6
  import json
7
  from typing import List, Dict
 
12
 
13
 
14
  class CLIPEmbedder:
15
+ """Custom embedder using sentence-transformers for multimodal content"""
16
  def __init__(self, model_name: str = EMBEDDING_MODEL):
17
+ print(f"🔄 Loading embedding model: {model_name}")
18
  self.model = SentenceTransformer(model_name)
19
+ print(f" Model loaded successfully")
20
 
21
  def embed(self, text: str) -> List[float]:
22
+ """Generate embedding for text"""
23
  try:
24
  embedding = self.model.encode(text, convert_to_numpy=False)
25
  return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
 
28
  return [0.0] * EMBEDDING_DIM
29
 
30
  def embed_batch(self, texts: List[str]) -> List[List[float]]:
31
+ """Generate embeddings for batch of texts"""
32
  try:
33
  embeddings = self.model.encode(texts, convert_to_numpy=False)
34
  return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
 
38
 
39
 
40
  class VectorStore:
41
+ """Vector store manager using ChromaDB (v0.4.22+ with auto-persist)"""
42
  def __init__(self):
43
  self.persist_directory = CHROMA_DB_PATH
44
  self.embedder = CLIPEmbedder()
45
 
46
+ print(f"\n🔄 Initializing ChromaDB at: {self.persist_directory}")
47
 
48
+ # NEW ChromaDB v0.4.22+ - PersistentClient auto-persists
49
  try:
50
  self.client = chromadb.PersistentClient(
51
  path=self.persist_directory
52
  )
53
+ print(f" ChromaDB PersistentClient initialized")
54
  except Exception as e:
55
+ print(f" Error initializing ChromaDB: {e}")
56
+ print(f"Trying fallback initialization...")
57
  self.client = chromadb.PersistentClient(
58
  path=self.persist_directory
59
  )
60
 
61
+ # Get or create collection
62
  try:
63
  self.collection = self.client.get_or_create_collection(
64
  name="multimodal_rag",
65
  metadata={"hnsw:space": "cosine"}
66
  )
67
  count = self.collection.count()
68
+ print(f" Collection loaded: {count} items in store")
69
  except Exception as e:
70
  print(f"Error with collection: {e}")
71
  self.collection = self.client.get_or_create_collection(
 
73
  )
74
 
75
  def add_documents(self, documents: List[Dict], doc_id: str):
76
+ """Add documents to vector store"""
77
  texts = []
78
  metadatas = []
79
  ids = []
80
 
81
+ print(f"\n📚 Adding documents for: {doc_id}")
82
 
83
+ # Add text chunks
84
  if 'text' in documents and documents['text']:
85
  chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
86
  for idx, chunk in enumerate(chunks):
 
91
  'chunk_idx': str(idx)
92
  })
93
  ids.append(f"{doc_id}_text_{idx}")
94
+ print(f"Text: {len(chunks)} chunks")
95
 
96
+ # Add image descriptions and OCR text
97
  if 'images' in documents:
98
  image_count = 0
99
  for idx, image_data in enumerate(documents['images']):
 
108
  ids.append(f"{doc_id}_image_{idx}")
109
  image_count += 1
110
  if image_count > 0:
111
+ print(f"Images: {image_count} with OCR text")
112
 
113
+ # Add table content
114
  if 'tables' in documents:
115
  table_count = 0
116
  for idx, table_data in enumerate(documents['tables']):
 
124
  ids.append(f"{doc_id}_table_{idx}")
125
  table_count += 1
126
  if table_count > 0:
127
+ print(f"Tables: {table_count}")
128
 
129
  if texts:
130
+ # Generate embeddings
131
+ print(f" 🔄 Generating {len(texts)} embeddings...")
132
  embeddings = self.embedder.embed_batch(texts)
133
 
134
+ # Add to collection
135
  try:
136
  self.collection.add(
137
  ids=ids,
 
139
  embeddings=embeddings,
140
  metadatas=metadatas
141
  )
142
+ print(f" Successfully added {len(texts)} items to vector store")
143
+ # Auto-persist happens here
144
+ print(f"✅ Data persisted automatically to: {self.persist_directory}")
145
  except Exception as e:
146
+ print(f" Error adding to collection: {e}")
147
 
148
  def search(self, query: str, n_results: int = 5) -> List[Dict]:
149
  """Search vector store for similar documents"""
 
155
  n_results=n_results
156
  )
157
 
158
+ # Format results
159
  formatted_results = []
160
  if results['documents']:
161
  for i, doc in enumerate(results['documents'][0]):
 
175
  return []
176
 
177
  def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
178
+ """Split text into chunks with overlap"""
179
  chunks = []
180
  start = 0
181
  while start < len(text):
 
185
  return chunks
186
 
187
  def get_collection_info(self) -> Dict:
188
+ """Get information about the collection"""
189
  try:
190
  count = self.collection.count()
191
  return {
 
199
  return {'status': 'error', 'message': str(e)}
200
 
201
  def delete_by_doc_id(self, doc_id: str):
202
+ """Delete all documents related to a specific doc_id"""
203
  try:
204
+ # Get all IDs with this doc_id
205
  results = self.collection.get(where={'doc_id': doc_id})
206
  if results['ids']:
207
  self.collection.delete(ids=results['ids'])
208
+ print(f" Deleted {len(results['ids'])} documents for {doc_id}")
209
+ # Auto-persist on delete
210
+ print(f"✅ Changes persisted automatically")
211
  except Exception as e:
212
  print(f"Error deleting documents: {e}")
213
 
214
  def persist(self):
215
+ """
216
+ No-op for compatibility with older code.
217
+ ChromaDB v0.4.22+ uses PersistentClient which auto-persists.
218
+ This method kept for backward compatibility.
219
+ """
220
+ print("✅ Vector store is using auto-persist (no manual persist needed)")
221
 
222
  def clear_all(self):
223
+ """Clear all documents from collection"""
224
  try:
225
+ # Delete collection and recreate
226
  self.client.delete_collection(name="multimodal_rag")
227
  self.collection = self.client.get_or_create_collection(
228
  name="multimodal_rag",
229
  metadata={"hnsw:space": "cosine"}
230
  )
231
+ print(" Collection cleared and reset")
232
  except Exception as e:
233
  print(f"Error clearing collection: {e}")