dnj0 commited on
Commit
b802cc4
·
1 Parent(s): e00eeca
Files changed (5) hide show
  1. src/app.py +77 -209
  2. src/config.py +10 -18
  3. src/pdf_parser.py +26 -48
  4. src/rag_system.py +89 -187
  5. src/vector_store.py +26 -50
src/app.py CHANGED
@@ -1,33 +1,25 @@
1
  """
2
- Multimodal RAG LLM System - Streamlit App
3
- Complete working version with VISUAL image analysis using gpt-4o
4
  """
5
 
6
  import streamlit as st
7
  import os
8
  from pathlib import Path
9
 
10
- # Import optimized versions
11
  from pdf_parser import PDFParser
12
  from vector_store import VectorStore
13
- from rag_system import VisualMultimodalRAG # NEW - Vision model
14
  from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
15
 
16
 
17
- # ============================================================================
18
- # PAGE CONFIGURATION
19
- # ============================================================================
20
 
21
  st.set_page_config(
22
- page_title="📄 Multimodal RAG LLM System",
23
- page_icon="🤖",
24
  layout="wide",
25
  initial_sidebar_state="expanded"
26
  )
27
 
28
- # ============================================================================
29
- # SESSION STATE INITIALIZATION
30
- # ============================================================================
31
 
32
  if 'api_key_set' not in st.session_state:
33
  st.session_state.api_key_set = False
@@ -35,7 +27,7 @@ if 'api_key_set' not in st.session_state:
35
  if 'api_key' not in st.session_state:
36
  st.session_state.api_key = None
37
 
38
- if 'visual_rag_system' not in st.session_state: # NEW - Vision model
39
  st.session_state.visual_rag_system = None
40
 
41
  if 'vector_store' not in st.session_state:
@@ -56,39 +48,28 @@ if 'current_images' not in st.session_state:
56
  if 'current_tables' not in st.session_state:
57
  st.session_state.current_tables = None
58
 
59
- if 'processing_results' not in st.session_state: # NEW
60
  st.session_state.processing_results = None
61
 
62
  if 'answering_rag' not in st.session_state:
63
  st.session_state.answering_rag = None
64
 
65
 
66
- # ============================================================================
67
- # MAIN HEADER
68
- # ============================================================================
69
 
70
- st.title("📄 Multimodal RAG LLM System")
71
  st.markdown("""
72
- Process PDF documents with visual image analysis:
73
- - **PDF Parser** with OCR for Russian & English
74
- - **Visual Analysis** (gpt-4o) for image understanding
75
- - **Vector Store** (ChromaDB) for semantic search
76
- - **Individual Component** summarization and storage
77
  """)
78
 
79
 
80
- # ============================================================================
81
- # SIDEBAR - CONFIGURATION
82
- # ============================================================================
83
 
84
  with st.sidebar:
85
- st.header("⚙️ Configuration")
86
 
87
- # API Key Section
88
- st.subheader("🔑 OpenAI API Key")
89
 
90
  api_key = st.text_input(
91
- "Enter your OpenAI API key:",
92
  type="password",
93
  key="api_key_input"
94
  )
@@ -97,62 +78,53 @@ with st.sidebar:
97
  st.session_state.api_key = api_key
98
  st.session_state.api_key_set = True
99
 
100
- # Initialize RAG systems if not already done
101
  if st.session_state.visual_rag_system is None:
102
  try:
103
  st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
104
  st.session_state.vector_store = VectorStore()
105
  st.session_state.parser = PDFParser(debug=True)
106
- st.success("API Key set & systems initialized")
107
  except Exception as e:
108
- st.error(f"Error initializing systems: {e}")
109
  else:
110
  st.session_state.api_key_set = False
111
- st.warning("⚠️ Please enter your API key to continue")
112
 
113
  st.divider()
114
 
115
- # Vector Store Status
116
- st.subheader("📊 Vector Store Status")
117
  if st.session_state.vector_store:
118
  try:
119
  info = st.session_state.vector_store.get_collection_info()
120
- st.metric("Items in Store", info['count'])
121
- st.metric("Status", info['status'])
122
- st.caption(f"Path: {info['persist_path']}")
123
  except Exception as e:
124
- st.error(f"Error getting store info: {e}")
125
  else:
126
- st.info("Set API key to initialize vector store")
127
 
128
  st.divider()
129
 
130
- # Document Management
131
- st.subheader("📁 Document Management")
132
- if st.button("🔄 Clear Vector Store"):
133
  if st.session_state.vector_store:
134
  try:
135
  st.session_state.vector_store.clear_all()
136
- st.success(" Vector store cleared")
137
  except Exception as e:
138
- st.error(f"Error clearing store: {e}")
139
 
140
 
141
- # ============================================================================
142
- # MAIN CONTENT
143
- # ============================================================================
144
 
145
- # Upload Section
146
- st.header("📤 Upload PDF Document")
147
 
148
  uploaded_file = st.file_uploader(
149
- "Choose a PDF file",
150
  type=['pdf'],
151
- help="PDF with text, images, and tables"
152
  )
153
 
154
  if uploaded_file is not None:
155
- # Save uploaded file
156
  upload_path = Path(UPLOAD_FOLDER)
157
  upload_path.mkdir(exist_ok=True)
158
 
@@ -160,91 +132,64 @@ if uploaded_file is not None:
160
  with open(file_path, 'wb') as f:
161
  f.write(uploaded_file.getbuffer())
162
 
163
- st.success(f" File saved: {uploaded_file.name}")
164
 
165
- # Parse PDF
166
- if st.button("🔍 Parse PDF"):
167
  if not st.session_state.api_key_set:
168
- st.error(" Please set OpenAI API key first")
169
  else:
170
  try:
171
- with st.spinner("📄 Parsing PDF..."):
172
- print(f"\n{'='*70}")
173
- print(f"PARSING: {uploaded_file.name}")
174
- print(f"{'='*70}")
175
 
176
- # Parse PDF - returns text, images, tables
177
  parser = st.session_state.parser
178
  text, images, tables = parser.parse_pdf(str(file_path))
179
 
180
- # Store in session state
181
  st.session_state.current_document = uploaded_file.name
182
  st.session_state.current_text = text
183
  st.session_state.current_images = images
184
  st.session_state.current_tables = tables
185
 
186
- # Display results
187
  col1, col2, col3 = st.columns(3)
188
  with col1:
189
- st.metric("📝 Text", f"{len(text):,} chars")
190
  with col2:
191
- st.metric("🖼️ Images", len(images))
192
  with col3:
193
- st.metric("📋 Tables", len(tables))
194
 
195
- # Show image OCR details
196
- if images:
197
- st.subheader("🖼️ Extracted Images")
198
- for idx, img in enumerate(images):
199
- ocr_text = img.get('ocr_text', '')
200
- ocr_len = len(ocr_text)
201
-
202
- if ocr_len > 0:
203
- st.success(f"✅ Image {idx}: {ocr_len} characters (OCR)")
204
- else:
205
- st.warning(f"⚠️ Image {idx}: No OCR text (will use visual analysis)")
206
-
207
- st.success("✅ PDF parsing complete!")
208
 
209
  except Exception as e:
210
- st.error(f" Error parsing PDF: {e}")
211
- print(f"Error: {e}")
212
 
213
 
214
- # ============================================================================
215
- # VISUAL IMAGE ANALYSIS & COMPONENT STORAGE
216
- # ============================================================================
217
 
218
  st.divider()
219
- st.header("🖼️ Visual Analysis & Storage")
220
 
221
  st.info("""
222
- **How it works:**
223
- 1. Images are sent to gpt-4o for visual analysis (not just text OCR)
224
- 2. Text is split into chunks and each chunk is summarized
225
- 3. Tables are analyzed individually
226
- 4. ALL summaries are stored in the vector store for semantic search
227
  """)
228
 
229
- if st.button("🖼️ Analyze Images Visually & Store Components"):
230
  if not st.session_state.api_key_set:
231
- st.error(" Please set OpenAI API key first")
232
  elif st.session_state.current_text is None:
233
- st.error(" Please parse a PDF document first")
234
  else:
235
  try:
236
- with st.spinner("🖼️ Analyzing images visually with gpt-4o..."):
237
- print(f"\n{'='*70}")
238
- print(f"VISUAL IMAGE ANALYSIS")
239
- print(f"{'='*70}")
240
 
241
- # Process with visual analysis
242
  visual_rag = st.session_state.visual_rag_system
243
  vector_store = st.session_state.vector_store
244
 
245
  results = visual_rag.process_and_store_document(
246
  text=st.session_state.current_text,
247
- images=st.session_state.current_images, # Actual images sent to gpt-4o
248
  tables=st.session_state.current_tables,
249
  vector_store=vector_store,
250
  doc_id=st.session_state.current_document or "current_doc"
@@ -252,107 +197,55 @@ if st.button("🖼️ Analyze Images Visually & Store Components"):
252
 
253
  st.session_state.processing_results = results
254
 
255
- # Display results
256
- st.success("✅ Visual analysis complete & stored!")
257
 
258
  col1, col2, col3 = st.columns(3)
259
  with col1:
260
- st.metric("🖼️ Images Analyzed", len(results['image_visual_analyses']))
261
  with col2:
262
- st.metric("📝 Text Chunks", len(results['text_summaries']))
263
  with col3:
264
- st.metric("📋 Tables Analyzed", len(results['table_summaries']))
265
-
266
- st.metric("📊 Total Stored in Vector", results['total_stored'])
267
-
268
- # Show image visual analyses
269
- if results['image_visual_analyses']:
270
- st.subheader("🖼️ Visual Image Analyses (gpt-4o)")
271
- for img_analysis in results['image_visual_analyses']:
272
- with st.expander(f"Image {img_analysis['image_index']} - Visual Analysis"):
273
- st.write("**Visual Analysis by gpt-4o:**")
274
- st.write(img_analysis['visual_analysis'])
275
-
276
- st.write("**Image Path:**")
277
- st.code(img_analysis['image_path'])
278
-
279
- if img_analysis['ocr_text']:
280
- st.write("**OCR Text (backup):**")
281
- st.text(img_analysis['ocr_text'][:500])
282
-
283
- # Show text chunk summaries
284
- if results['text_summaries']:
285
- st.subheader("📝 Text Chunk Summaries")
286
- for chunk_summary in results['text_summaries']:
287
- with st.expander(
288
- f"Chunk {chunk_summary['chunk_index']} "
289
- f"({chunk_summary['chunk_length']} chars)"
290
- ):
291
- st.write("**Summary:**")
292
- st.write(chunk_summary['summary'])
293
- st.write("**Original Text (first 500 chars):**")
294
- st.text(chunk_summary['original_text'])
295
 
296
- # Show table analyses
297
- if results['table_summaries']:
298
- st.subheader("📋 Table Analyses")
299
- for table_summary in results['table_summaries']:
300
- with st.expander(
301
- f"Table {table_summary['table_index']} "
302
- f"({table_summary['table_length']} chars)"
303
- ):
304
- st.write("**Analysis:**")
305
- st.write(table_summary['summary'])
306
- st.write("**Original Content (first 500 chars):**")
307
- st.text(table_summary['original_content'])
308
 
309
- print(f"\n✅ Visual analysis processing complete!")
310
 
311
  except Exception as e:
312
- st.error(f" Error during visual analysis: {e}")
313
- print(f"Error: {e}")
314
 
315
 
316
- # ============================================================================
317
- # QUESTION & ANSWERING
318
- # ============================================================================
319
 
320
  st.divider()
321
- st.header(" Ask Questions About Document")
322
 
323
- # Initialize answering system if not done
324
  if 'answering_rag' not in st.session_state:
325
  st.session_state.answering_rag = None
326
 
327
- # Create answering system when API key is set
328
  if st.session_state.api_key_set and st.session_state.answering_rag is None:
329
  from rag_system import AnsweringRAG
330
  st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
331
 
332
  question = st.text_area(
333
- "Enter your question:",
334
  height=100,
335
- placeholder="What does the document say about...?"
336
  )
337
 
338
- if st.button("🔍 Search & Generate Answer"):
339
  if not st.session_state.api_key_set:
340
- st.error(" Please set OpenAI API key first")
341
  elif st.session_state.current_text is None:
342
- st.error(" Please parse a PDF document first")
343
  elif not question:
344
- st.error(" Please enter a question")
345
  else:
346
  try:
347
- with st.spinner("🔄 Searching document and analyzing..."):
348
- print(f"\n{'='*70}")
349
- print(f"QUESTION: {question}")
350
- print(f"{'='*70}")
351
-
352
- # Search vector store
353
  store = st.session_state.vector_store
354
 
355
- # Add documents to store if needed
356
  doc_name = st.session_state.current_document or "current_doc"
357
  doc_data = {
358
  'text': st.session_state.current_text,
@@ -361,21 +254,17 @@ if st.button("🔍 Search & Generate Answer"):
361
  }
362
  store.add_documents(doc_data, doc_name)
363
 
364
- # Search for relevant results
365
  search_results = store.search(question, n_results=5)
366
 
367
- print(f"\n📊 Search Results Found: {len(search_results)}")
368
 
369
- # Analyze results and generate answer
370
  answering_rag = st.session_state.answering_rag
371
  result = answering_rag.analyze_and_answer(question, search_results)
372
 
373
- # Display answer prominently
374
- st.success("✅ Analysis complete!")
375
 
376
- st.subheader("📝 Answer")
377
 
378
- # Show confidence level
379
  col1, col2, col3 = st.columns(3)
380
  with col1:
381
  confidence_color = {
@@ -383,56 +272,35 @@ if st.button("🔍 Search & Generate Answer"):
383
  'medium': '🟡',
384
  'low': '🔴'
385
  }.get(result['confidence'], '⚪')
386
- st.metric("Confidence", f"{confidence_color} {result['confidence'].upper()}")
387
  with col2:
388
- st.metric("Sources Used", result['sources_used'])
389
  with col3:
390
  if result['sources_used'] > 0:
391
- st.metric("Avg Relevance", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
392
 
393
- # Display the generated answer
394
  st.write(result['answer'])
395
 
396
- # Show sources
397
- if st.checkbox("📚 Show Source Documents"):
398
- st.subheader("Sources Used in Answer")
399
  for idx, source in enumerate(result['formatted_sources'], 1):
400
  relevance = source['relevance']
401
- relevance_bar = "" * int(relevance * 10) + "" * (10 - int(relevance * 10))
402
 
403
  with st.expander(
404
- f"Source {idx} - {source['type'].upper()} "
405
  f"[{relevance_bar}] {relevance:.0%}"
406
  ):
407
  st.write(source['content'])
408
 
409
- print(f"\n✅ Answer generation complete!")
410
 
411
  except Exception as e:
412
- st.error(f" Error processing question: {e}")
413
- print(f"Error: {e}")
414
-
415
-
416
- # ============================================================================
417
- # FOOTER
418
- # ============================================================================
419
 
420
  st.divider()
421
 
422
- col1, col2, col3 = st.columns(3)
423
-
424
- with col1:
425
- st.info("📖 **Text Processing**: PyPDF2 extraction with UTF-8 support")
426
-
427
- with col2:
428
- st.info("🖼️ **Visual Analysis**: GPT-4o vision for image understanding")
429
-
430
- with col3:
431
- st.info("📊 **Vector Storage**: ChromaDB with auto-persist")
432
-
433
  st.caption(
434
- "Multimodal RAG System | "
435
- "Visual Image Analysis | "
436
- "Russian Language Support | "
437
- "Individual Component Summarization"
438
  )
 
1
  """
2
+ UI RAG
 
3
  """
4
 
5
  import streamlit as st
6
  import os
7
  from pathlib import Path
8
 
 
9
  from pdf_parser import PDFParser
10
  from vector_store import VectorStore
11
+ from rag_system import VisualMultimodalRAG
12
  from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
13
 
14
 
 
 
 
15
 
16
  st.set_page_config(
17
+ page_title="Мультимодальная RAG система (PDF parsing)",
 
18
  layout="wide",
19
  initial_sidebar_state="expanded"
20
  )
21
 
22
+
 
 
23
 
24
  if 'api_key_set' not in st.session_state:
25
  st.session_state.api_key_set = False
 
27
  if 'api_key' not in st.session_state:
28
  st.session_state.api_key = None
29
 
30
+ if 'visual_rag_system' not in st.session_state:
31
  st.session_state.visual_rag_system = None
32
 
33
  if 'vector_store' not in st.session_state:
 
48
  if 'current_tables' not in st.session_state:
49
  st.session_state.current_tables = None
50
 
51
+ if 'processing_results' not in st.session_state:
52
  st.session_state.processing_results = None
53
 
54
  if 'answering_rag' not in st.session_state:
55
  st.session_state.answering_rag = None
56
 
57
 
 
 
 
58
 
59
+ st.title("Мультимодальная RAG система (PDF parsing)")
60
  st.markdown("""
61
+ Обрабатывает PDF документы и предоставляет информацию по ним
 
 
 
 
62
  """)
63
 
64
 
 
 
 
65
 
66
  with st.sidebar:
67
+ st.header(" Конфигурация")
68
 
69
+ st.subheader(" OpenAI API Ключ")
 
70
 
71
  api_key = st.text_input(
72
+ "Введите OpenAI API ключ:",
73
  type="password",
74
  key="api_key_input"
75
  )
 
78
  st.session_state.api_key = api_key
79
  st.session_state.api_key_set = True
80
 
 
81
  if st.session_state.visual_rag_system is None:
82
  try:
83
  st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
84
  st.session_state.vector_store = VectorStore()
85
  st.session_state.parser = PDFParser(debug=True)
86
+ st.success("API ключ введен")
87
  except Exception as e:
88
+ st.error(f"Ошибка старта системы: {e}")
89
  else:
90
  st.session_state.api_key_set = False
91
+ st.warning("Введите OpenAI API ключ")
92
 
93
  st.divider()
94
 
95
+ st.subheader("Векторное хранилище")
 
96
  if st.session_state.vector_store:
97
  try:
98
  info = st.session_state.vector_store.get_collection_info()
99
+ st.metric("Документов в хранилище", info['count'])
100
+ st.caption(f"Расположение: {info['persist_path']}")
 
101
  except Exception as e:
102
+ st.error(f"Ошибка получения информации: {e}")
103
  else:
104
+ st.info("Введите OpenAI API ключ")
105
 
106
  st.divider()
107
 
108
+ st.subheader("Управление хранилищем")
109
+ if st.button("Очистить хранилище"):
 
110
  if st.session_state.vector_store:
111
  try:
112
  st.session_state.vector_store.clear_all()
113
+ st.success("Хранилище очищено")
114
  except Exception as e:
115
+ st.error(f"Ошибка очистки хранилища: {e}")
116
 
117
 
 
 
 
118
 
119
+ st.header("Загрузить PDF")
 
120
 
121
  uploaded_file = st.file_uploader(
122
+ "Выбрать...",
123
  type=['pdf'],
124
+ help="Загрузите PDF файл"
125
  )
126
 
127
  if uploaded_file is not None:
 
128
  upload_path = Path(UPLOAD_FOLDER)
129
  upload_path.mkdir(exist_ok=True)
130
 
 
132
  with open(file_path, 'wb') as f:
133
  f.write(uploaded_file.getbuffer())
134
 
135
+ st.success(f"Файл загружен: {uploaded_file.name}")
136
 
137
+ if st.button("Распарсить PDF"):
 
138
  if not st.session_state.api_key_set:
139
+ st.error("Введите OpenAI API ключ")
140
  else:
141
  try:
142
+ with st.spinner(" Парсинг PDF..."):
143
+
144
+ print(f"Парсинг PDF файла: {uploaded_file.name}")
145
+
146
 
 
147
  parser = st.session_state.parser
148
  text, images, tables = parser.parse_pdf(str(file_path))
149
 
 
150
  st.session_state.current_document = uploaded_file.name
151
  st.session_state.current_text = text
152
  st.session_state.current_images = images
153
  st.session_state.current_tables = tables
154
 
 
155
  col1, col2, col3 = st.columns(3)
156
  with col1:
157
+ st.metric("Текста", f"{len(text):,} chars")
158
  with col2:
159
+ st.metric("Изображений", len(images))
160
  with col3:
161
+ st.metric("Таблиц", len(tables))
162
 
163
+ st.success("Парсинг PDF завершен!")
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  except Exception as e:
166
+ st.error(f"Парсинг PDF завершелся с ошибкой: {e}")
167
+ print(f"Ошибка: {e}")
168
 
169
 
 
 
 
170
 
171
  st.divider()
172
+ st.header("Анализ документа")
173
 
174
  st.info("""
175
+ Отправляет содержимое документа на анализ
 
 
 
 
176
  """)
177
 
178
+ if st.button("Проанализировать документ"):
179
  if not st.session_state.api_key_set:
180
+ st.error("Введите OpenAI API ключ")
181
  elif st.session_state.current_text is None:
182
+ st.error("Распарсите документ")
183
  else:
184
  try:
185
+ with st.spinner("Анализ с gpt-4o-mini..."):
 
 
 
186
 
 
187
  visual_rag = st.session_state.visual_rag_system
188
  vector_store = st.session_state.vector_store
189
 
190
  results = visual_rag.process_and_store_document(
191
  text=st.session_state.current_text,
192
+ images=st.session_state.current_images,
193
  tables=st.session_state.current_tables,
194
  vector_store=vector_store,
195
  doc_id=st.session_state.current_document or "current_doc"
 
197
 
198
  st.session_state.processing_results = results
199
 
200
+ st.success("Анализ готов!")
 
201
 
202
  col1, col2, col3 = st.columns(3)
203
  with col1:
204
+ st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
205
  with col2:
206
+ st.metric("Проанализировано чанков текста", len(results['text_summaries']))
207
  with col3:
208
+ st.metric("Проанализировано таблиц", len(results['table_summaries']))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
+ st.metric("Помещено в хранилище", results['total_stored'])
211
+
 
 
 
 
 
 
 
 
 
 
212
 
213
+ print(f"Анализ завершен")
214
 
215
  except Exception as e:
216
+ st.error(f"Ошибка в ходе: {e}")
217
+ print(f"Ошибка: {e}")
218
 
219
 
 
 
 
220
 
221
  st.divider()
222
+ st.header("Работа с документом")
223
 
 
224
  if 'answering_rag' not in st.session_state:
225
  st.session_state.answering_rag = None
226
 
 
227
  if st.session_state.api_key_set and st.session_state.answering_rag is None:
228
  from rag_system import AnsweringRAG
229
  st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
230
 
231
  question = st.text_area(
232
+ "Введите запрос:",
233
  height=100,
234
+ placeholder="О чем данный документ?"
235
  )
236
 
237
+ if st.button("Генерация ответа"):
238
  if not st.session_state.api_key_set:
239
+ st.error("Введите OpenAI API ключ")
240
  elif st.session_state.current_text is None:
241
+ st.error("Распарсите документ")
242
  elif not question:
243
+ st.error("Введите запрос")
244
  else:
245
  try:
246
+ with st.spinner("Поиск документов..."):
 
 
 
 
 
247
  store = st.session_state.vector_store
248
 
 
249
  doc_name = st.session_state.current_document or "current_doc"
250
  doc_data = {
251
  'text': st.session_state.current_text,
 
254
  }
255
  store.add_documents(doc_data, doc_name)
256
 
 
257
  search_results = store.search(question, n_results=5)
258
 
259
+ print(f"Найдено: {len(search_results)}")
260
 
 
261
  answering_rag = st.session_state.answering_rag
262
  result = answering_rag.analyze_and_answer(question, search_results)
263
 
264
+ st.success("Поиск завершен!")
 
265
 
266
+ st.subheader("Ответ")
267
 
 
268
  col1, col2, col3 = st.columns(3)
269
  with col1:
270
  confidence_color = {
 
272
  'medium': '🟡',
273
  'low': '🔴'
274
  }.get(result['confidence'], '⚪')
275
+ st.metric("Уверенность в ответе", f"{confidence_color} {result['confidence'].upper()}")
276
  with col2:
277
+ st.metric("Использовано источников", result['sources_used'])
278
  with col3:
279
  if result['sources_used'] > 0:
280
+ st.metric("Среднняя релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
281
 
 
282
  st.write(result['answer'])
283
 
284
+ if st.checkbox("Показать исходные документы"):
285
+ st.subheader("Использованы докуме��ты")
 
286
  for idx, source in enumerate(result['formatted_sources'], 1):
287
  relevance = source['relevance']
288
+ relevance_bar = "\/" * int(relevance * 10) + "|" * (10 - int(relevance * 10))
289
 
290
  with st.expander(
291
+ f"Источник {idx} - {source['type'].upper()} "
292
  f"[{relevance_bar}] {relevance:.0%}"
293
  ):
294
  st.write(source['content'])
295
 
296
+ print(f" Ответ готов!")
297
 
298
  except Exception as e:
299
+ st.error(f"Ошибка обработки запроса: {e}")
300
+ print(f"Ошибка: {e}")
 
 
 
 
 
301
 
302
  st.divider()
303
 
 
 
 
 
 
 
 
 
 
 
 
304
  st.caption(
305
+ "Мультимодальная RAG система для парсинга PDF документов"
 
 
 
306
  )
src/config.py CHANGED
@@ -1,42 +1,34 @@
1
  """
2
- Configuration file for Multimodal RAG LLM System
3
  """
4
  import os
5
  from pathlib import Path
6
 
7
- # API Configuration
8
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
9
- OPENAI_MODEL = "gpt-4o-mini" # Cheaper model variant
10
- USE_CACHE = True # Enable response caching
11
 
12
- # Vector Store Configuration
13
  CHROMA_DB_PATH = "./chroma_db"
14
  DOCSTORE_PATH = "./docstore"
15
  PROCESSED_FILES_LOG = "./processed_files.txt"
16
 
17
- # Embedding Model Configuration
18
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
19
  EMBEDDING_DIM = 768
20
 
21
- # System Configuration
22
- MAX_CHUNK_SIZE = 500 # Smaller chunks = fewer tokens
23
- CHUNK_OVERLAP = 50 # Less overlap = fewer chunks
24
- TEMPERATURE = 0.3 # Lower = faster, cheaper
25
- MAX_TOKENS = 500 # Limit response size (vs 1500)
26
 
27
- # Language Support
28
  LANGUAGE = "russian"
29
 
30
- # Create necessary directories
31
  Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
32
  Path(DOCSTORE_PATH).mkdir(exist_ok=True)
33
 
34
- # PDF Upload Configuration
35
  UPLOAD_FOLDER = "./uploaded_pdfs"
36
  Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
37
  MAX_PDF_SIZE_MB = 50
38
 
39
- # TOKEN OPTIMIZATION SETTINGS
40
- BATCH_SEARCH_RESULTS = 3 # Return only top 3 (not 5)
41
- CACHE_RESPONSES = True # Cache Q&A responses
42
- SUMMARIZE_FIRST = True # Summarize PDFs once, not per query
 
1
  """
2
+ Конфигурационный файл
3
  """
4
  import os
5
  from pathlib import Path
6
 
 
7
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
8
+ OPENAI_MODEL = "gpt-4o-mini"
9
+ USE_CACHE = True
10
 
 
11
  CHROMA_DB_PATH = "./chroma_db"
12
  DOCSTORE_PATH = "./docstore"
13
  PROCESSED_FILES_LOG = "./processed_files.txt"
14
 
 
15
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
16
  EMBEDDING_DIM = 768
17
 
18
+ MAX_CHUNK_SIZE = 500
19
+ CHUNK_OVERLAP = 50
20
+ TEMPERATURE = 0.3
21
+ MAX_TOKENS = 500
 
22
 
 
23
  LANGUAGE = "russian"
24
 
 
25
  Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
26
  Path(DOCSTORE_PATH).mkdir(exist_ok=True)
27
 
 
28
  UPLOAD_FOLDER = "./uploaded_pdfs"
29
  Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
30
  MAX_PDF_SIZE_MB = 50
31
 
32
+ BATCH_SEARCH_RESULTS = 3
33
+ CACHE_RESPONSES = True
34
+ SUMMARIZE_FIRST = True
 
src/pdf_parser.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- PDF Parser Module with FIXED Russian OCR support
3
  """
4
  import os
5
  import json
@@ -20,27 +20,14 @@ class PDFParser:
20
  self.processed_files = self._load_processed_files()
21
  self.debug = debug
22
 
23
- # Configure Tesseract for Russian + English
24
  self._configure_tesseract()
25
 
26
  if self.debug:
27
- print("PDFParser initialized with Russian OCR support")
28
-
29
- def _configure_tesseract(self):
30
- """Configure Tesseract with proper paths and language support"""
31
- try:
32
- # Windows specific path
33
- if os.name == 'nt':
34
- pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
35
-
36
- # Test Tesseract
37
- pytesseract.get_tesseract_version()
38
- print("✅ Tesseract configured successfully")
39
- except Exception as e:
40
- print(f"⚠️ Tesseract configuration warning: {e}")
41
 
42
  def _debug_print(self, label: str, data: any):
43
- """Print debug information"""
44
  if self.debug:
45
  print(f"\n🔍 [PDF Parser] {label}")
46
  if isinstance(data, dict):
@@ -54,7 +41,7 @@ class PDFParser:
54
  print(f" {data}")
55
 
56
  def _load_processed_files(self) -> Dict[str, str]:
57
- """Load list of already processed files with their hashes"""
58
  if os.path.exists(PROCESSED_FILES_LOG):
59
  try:
60
  with open(PROCESSED_FILES_LOG, 'r') as f:
@@ -64,12 +51,12 @@ class PDFParser:
64
  return {}
65
 
66
  def _save_processed_files(self):
67
- """Save processed files list to disk"""
68
  with open(PROCESSED_FILES_LOG, 'w') as f:
69
  json.dump(self.processed_files, f, indent=2)
70
 
71
  def _get_file_hash(self, file_path: str) -> str:
72
- """Generate hash of file to detect changes"""
73
  hash_md5 = hashlib.md5()
74
  with open(file_path, "rb") as f:
75
  for chunk in iter(lambda: f.read(4096), b""):
@@ -77,7 +64,7 @@ class PDFParser:
77
  return hash_md5.hexdigest()
78
 
79
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
80
- """Extract text from PDF using PyPDF2"""
81
  text = ""
82
  try:
83
  with open(pdf_path, 'rb') as file:
@@ -96,40 +83,36 @@ class PDFParser:
96
  return text
97
 
98
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
99
- """Extract images from PDF pages with Russian OCR support"""
100
  images_data = []
101
  try:
102
- self._debug_print("Image Extraction Started", f"File: {pdf_path}")
103
 
104
  images = convert_from_path(pdf_path, dpi=150)
105
- self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
106
 
107
  for idx, image in enumerate(images):
108
- self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
109
 
110
- # Save image
111
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
112
  image.save(image_path)
113
  self._debug_print(f"Image {idx} Saved", str(image_path))
114
 
115
- # Extract text using OCR with Russian support
116
- self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
117
 
118
  try:
119
- # CRITICAL: Use 'rus+eng' for Russian + English support
120
  ocr_text = pytesseract.image_to_string(image, lang='rus')
121
 
122
- # Clean up text
123
  ocr_text = ocr_text.strip()
124
 
125
  if not ocr_text or len(ocr_text) < 5:
126
- self._debug_print(f"Image {idx} OCR Result", f"⚠️ EMPTY or very short ({len(ocr_text)} chars)")
127
  else:
128
- self._debug_print(f"Image {idx} OCR Result", f" Success - {len(ocr_text)} chars: {ocr_text[:150]}")
129
 
130
  except Exception as ocr_error:
131
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
132
- ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"
133
 
134
  images_data.append({
135
  'page': idx,
@@ -144,13 +127,13 @@ class PDFParser:
144
  return images_data
145
 
146
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
147
- """Extract table content from PDF"""
148
  tables_data = []
149
  try:
150
  text = self._extract_text_from_pdf(pdf_path)
151
  lines = text.split('\n')
152
 
153
- self._debug_print("Table Detection", f"Scanning {len(lines)} lines")
154
 
155
  current_table = []
156
  for line in lines:
@@ -177,44 +160,39 @@ class PDFParser:
177
  return tables_data
178
 
179
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
180
- """Parse PDF and extract text, images, and tables with debug output"""
181
  file_hash = self._get_file_hash(pdf_path)
182
  doc_id = Path(pdf_path).stem
183
 
184
- self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
185
 
186
- # Check if file was already processed
187
  if doc_id in self.processed_files:
188
  if self.processed_files[doc_id] == file_hash:
189
- self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
190
  return self._load_extracted_data(doc_id)
191
 
192
- print(f"\n📄 Processing PDF: {doc_id}")
193
 
194
- # Extract content
195
  text = self._extract_text_from_pdf(pdf_path)
196
  images = self._extract_images_from_pdf(pdf_path, doc_id)
197
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
198
 
199
- # Summary
200
- self._debug_print("Extraction Summary", {
201
  'text_length': len(text),
202
  'images_count': len(images),
203
  'tables_count': len(tables),
204
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
205
  })
206
 
207
- # Save extracted data
208
  self._save_extracted_data(doc_id, text, images, tables)
209
 
210
- # Update processed files log
211
  self.processed_files[doc_id] = file_hash
212
  self._save_processed_files()
213
 
214
  return text, images, tables
215
 
216
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
217
- """Save extracted data to docstore"""
218
  data = {
219
  'text': text,
220
  'images': images,
@@ -227,7 +205,7 @@ class PDFParser:
227
  self._debug_print("Data Saved", str(data_path))
228
 
229
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
230
- """Load previously extracted data from docstore"""
231
  data_path = self.docstore_path / f"{doc_id}_data.json"
232
  try:
233
  with open(data_path, 'r', encoding='utf-8') as f:
@@ -237,7 +215,7 @@ class PDFParser:
237
  return "", [], []
238
 
239
  def get_all_documents(self) -> Dict:
240
- """Load all processed documents from docstore"""
241
  all_docs = {}
242
  for json_file in self.docstore_path.glob("*_data.json"):
243
  doc_id = json_file.stem.replace("_data", "")
 
1
  """
2
+ PDF Парсер
3
  """
4
  import os
5
  import json
 
20
  self.processed_files = self._load_processed_files()
21
  self.debug = debug
22
 
23
+
24
  self._configure_tesseract()
25
 
26
  if self.debug:
27
+ print("PDFParser initialized")
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def _debug_print(self, label: str, data: any):
30
+ """Debug"""
31
  if self.debug:
32
  print(f"\n🔍 [PDF Parser] {label}")
33
  if isinstance(data, dict):
 
41
  print(f" {data}")
42
 
43
  def _load_processed_files(self) -> Dict[str, str]:
44
+ """Подгрузка обработанных файлов"""
45
  if os.path.exists(PROCESSED_FILES_LOG):
46
  try:
47
  with open(PROCESSED_FILES_LOG, 'r') as f:
 
51
  return {}
52
 
53
  def _save_processed_files(self):
54
+ """Сохранение обработанных файлов"""
55
  with open(PROCESSED_FILES_LOG, 'w') as f:
56
  json.dump(self.processed_files, f, indent=2)
57
 
58
  def _get_file_hash(self, file_path: str) -> str:
59
+ """Проверка изменения файлов"""
60
  hash_md5 = hashlib.md5()
61
  with open(file_path, "rb") as f:
62
  for chunk in iter(lambda: f.read(4096), b""):
 
64
  return hash_md5.hexdigest()
65
 
66
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
67
+ """Извлечение текста из PDF"""
68
  text = ""
69
  try:
70
  with open(pdf_path, 'rb') as file:
 
83
  return text
84
 
85
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
86
+ """Извлечение изображений из PDF"""
87
  images_data = []
88
  try:
89
+ self._debug_print("Image extraction", f"File: {pdf_path}")
90
 
91
  images = convert_from_path(pdf_path, dpi=150)
92
+ self._debug_print(f"Total images: {len(images)}")
93
 
94
  for idx, image in enumerate(images):
95
+ self._debug_print(f"Image {idx}", f"Size: {image.size}")
96
 
 
97
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
98
  image.save(image_path)
99
  self._debug_print(f"Image {idx} Saved", str(image_path))
100
 
101
+ self._debug_print(f"Image {idx} OCR", "Running OCR...")
 
102
 
103
  try:
 
104
  ocr_text = pytesseract.image_to_string(image, lang='rus')
105
 
 
106
  ocr_text = ocr_text.strip()
107
 
108
  if not ocr_text or len(ocr_text) < 5:
109
+ self._debug_print(f"Image {idx} OCR Result", f"WARN ({len(ocr_text)} chars)")
110
  else:
111
+ self._debug_print(f"Image {idx} OCR Result", f"SUCCESS {len(ocr_text)} chars: {ocr_text[:150]}")
112
 
113
  except Exception as ocr_error:
114
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
115
+ ocr_text = f"[Image {idx}: OCR failed {str(ocr_error)}]"
116
 
117
  images_data.append({
118
  'page': idx,
 
127
  return images_data
128
 
129
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
130
+ """Извлечение таблиц из PDF"""
131
  tables_data = []
132
  try:
133
  text = self._extract_text_from_pdf(pdf_path)
134
  lines = text.split('\n')
135
 
136
+ self._debug_print("Table extraction", f"Scanning {len(lines)} lines")
137
 
138
  current_table = []
139
  for line in lines:
 
160
  return tables_data
161
 
162
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
163
+ """Парсинг PDF"""
164
  file_hash = self._get_file_hash(pdf_path)
165
  doc_id = Path(pdf_path).stem
166
 
167
+ self._debug_print("PDF Parsing Started", f"File: {doc_id}")
168
 
 
169
  if doc_id in self.processed_files:
170
  if self.processed_files[doc_id] == file_hash:
171
+ self._debug_print("Status", f"File {doc_id} already processed")
172
  return self._load_extracted_data(doc_id)
173
 
174
+ print(f"\nProcessing PDF: {doc_id}")
175
 
 
176
  text = self._extract_text_from_pdf(pdf_path)
177
  images = self._extract_images_from_pdf(pdf_path, doc_id)
178
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
179
 
180
+ self._debug_print("Summary", {
 
181
  'text_length': len(text),
182
  'images_count': len(images),
183
  'tables_count': len(tables),
184
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
185
  })
186
 
 
187
  self._save_extracted_data(doc_id, text, images, tables)
188
 
 
189
  self.processed_files[doc_id] = file_hash
190
  self._save_processed_files()
191
 
192
  return text, images, tables
193
 
194
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
195
+ """Сохранение извелеченных данных в Docstore"""
196
  data = {
197
  'text': text,
198
  'images': images,
 
205
  self._debug_print("Data Saved", str(data_path))
206
 
207
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
208
+ """Подгрузка ранее извлеченных данных из Docstore"""
209
  data_path = self.docstore_path / f"{doc_id}_data.json"
210
  try:
211
  with open(data_path, 'r', encoding='utf-8') as f:
 
215
  return "", [], []
216
 
217
  def get_all_documents(self) -> Dict:
218
+ """Получение всех документов из Docstore"""
219
  all_docs = {}
220
  for json_file in self.docstore_path.glob("*_data.json"):
221
  doc_id = json_file.stem.replace("_data", "")
src/rag_system.py CHANGED
@@ -1,7 +1,5 @@
1
  """
2
- Enhanced RAG System - Visual Image Analysis
3
- Sends base64 images directly to GPT-4o for visual analysis (not just OCR)
4
- Then stores results in vector store
5
  """
6
  from typing import List, Dict
7
  from langchain_openai import ChatOpenAI
@@ -17,20 +15,18 @@ from config import (
17
 
18
  class VisualMultimodalRAG:
19
  """
20
- RAG system that:
21
- 1. Sends images as base64 to GPT-4o for visual analysis
22
- 2. Gets detailed visual descriptions and insights
23
- 3. Stores visual analysis in vector store
24
- 4. Enables image-based semantic search
25
  """
26
 
27
  def __init__(self, api_key: str = None, debug: bool = True):
28
  api_key = api_key or OPENAI_API_KEY
29
  self.debug = debug
30
 
31
- # Use gpt-4o for vision capabilities
32
  self.llm = ChatOpenAI(
33
- model_name="gpt-4o-mini", # CRITICAL: gpt-4o has vision
34
  api_key=api_key,
35
  temperature=TEMPERATURE,
36
  max_tokens=MAX_TOKENS,
@@ -40,12 +36,12 @@ class VisualMultimodalRAG:
40
  self.visual_summaries_log = []
41
 
42
  if self.debug:
43
- print("VisualMultimodalRAG initialized with gpt-4o (vision model)")
44
 
45
  def _debug_print(self, label: str, data: any):
46
- """Print debug information"""
47
  if self.debug:
48
- print(f"\n🔍 DEBUG [{label}]:")
49
  if isinstance(data, (list, dict)):
50
  print(f" Type: {type(data).__name__}")
51
  print(f" Content: {str(data)[:300]}...")
@@ -53,7 +49,7 @@ class VisualMultimodalRAG:
53
  print(f" {data}")
54
 
55
  def _image_to_base64(self, image_path: str) -> str:
56
- """Convert image file to base64 string"""
57
  try:
58
  with open(image_path, 'rb') as image_file:
59
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
@@ -64,27 +60,16 @@ class VisualMultimodalRAG:
64
 
65
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
66
  """
67
- Send actual image (base64) to gpt-4o for visual analysis
68
- Returns detailed visual analysis/description
69
-
70
- gpt-4o can see:
71
- - Charts, graphs, diagrams
72
- - Tables and structured data
73
- - Photos and drawings
74
- - Handwritten text
75
- - Screenshots
76
- - Any visual content
77
  """
78
  if not os.path.exists(image_path):
79
  return f"[Image {image_idx}: File not found - {image_path}]"
80
 
81
  try:
82
- # Convert image to base64
83
  image_base64 = self._image_to_base64(image_path)
84
  if not image_base64:
85
- return f"[Image {image_idx}: Could not convert to base64]"
86
 
87
- # Determine image type
88
  file_ext = Path(image_path).suffix.lower()
89
  media_type_map = {
90
  '.jpg': 'image/jpeg',
@@ -95,9 +80,8 @@ class VisualMultimodalRAG:
95
  }
96
  media_type = media_type_map.get(file_ext, 'image/png')
97
 
98
- print(f"🔍 Analyzing image {image_idx} visually (as {media_type})...")
99
 
100
- # Create message with image
101
  message = HumanMessage(
102
  content=[
103
  {
@@ -108,41 +92,38 @@ class VisualMultimodalRAG:
108
  },
109
  {
110
  "type": "text",
111
- "text": f"""Analyze this image in detail in {self.language}.
112
 
113
- Provide a comprehensive visual analysis including:
114
- 1. **What you see** - Main objects, elements, structure
115
- 2. **Data/Content** - Any numbers, text, charts, graphs
116
- 3. **Purpose** - What this image is showing or representing
117
- 4. **Key insights** - Important patterns, trends, or information
118
- 5. **Connections** - How this relates to document content
119
 
120
- Be specific and detailed. Focus on visual information that cannot be extracted from text alone.
121
 
122
- Analysis:"""
123
  }
124
  ],
125
  )
126
 
127
- # Call gpt-4o with vision
128
  response = self.llm.invoke([message])
129
  analysis = response.content.strip()
130
 
131
  if self.debug:
132
  self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
133
 
134
- print(f" Image {image_idx} analyzed successfully")
135
  return analysis
136
 
137
  except Exception as e:
138
  error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
139
- print(f" Error analyzing image {image_idx}: {e}")
140
  return error_msg
141
 
142
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
143
  """
144
- Analyze each image visually using gpt-4o vision
145
- Returns list of {image_index, visual_analysis, type}
146
  """
147
  visual_analyses = []
148
 
@@ -150,10 +131,9 @@ Analysis:"""
150
  image_path = image.get('path', '')
151
 
152
  if not image_path:
153
- print(f"⚠️ Image {idx}: No path provided")
154
  continue
155
 
156
- # Analyze image visually (not just OCR)
157
  visual_analysis = self.analyze_image_visually(image_path, idx)
158
 
159
  visual_analyses.append({
@@ -161,14 +141,14 @@ Analysis:"""
161
  'image_index': idx,
162
  'image_path': image_path,
163
  'visual_analysis': visual_analysis,
164
- 'ocr_text': image.get('ocr_text', '') # Keep OCR as backup
165
  })
166
 
167
  return visual_analyses
168
 
169
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
170
  """
171
- Chunk text and summarize each chunk individually
172
  """
173
  chunks = []
174
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
@@ -180,13 +160,13 @@ Analysis:"""
180
  continue
181
 
182
  try:
183
- prompt = f"""Summarize this text chunk in {self.language}.
184
- Keep it concise. Extract key points, facts, and main ideas.
185
 
186
- Text Chunk:
187
  {chunk}
188
 
189
- Summary (2-3 sentences maximum):"""
190
 
191
  message = HumanMessage(content=prompt)
192
  response = self.llm.invoke([message])
@@ -210,7 +190,7 @@ Summary (2-3 sentences maximum):"""
210
 
211
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
212
  """
213
- Summarize each table individually
214
  """
215
  summaries = []
216
 
@@ -221,13 +201,13 @@ Summary (2-3 sentences maximum):"""
221
  continue
222
 
223
  try:
224
- prompt = f"""Analyze and summarize this table/structured data in {self.language}.
225
- Extract key insights, row/column meanings, and important figures.
226
 
227
- Table Content:
228
  {table_content}
229
 
230
- Summary (2-3 sentences maximum):"""
231
 
232
  message = HumanMessage(content=prompt)
233
  response = self.llm.invoke([message])
@@ -258,12 +238,10 @@ Summary (2-3 sentences maximum):"""
258
  doc_id: str
259
  ) -> Dict:
260
  """
261
- Main function: Analyze all components visually and store in vector store
262
- Images are analyzed using gpt-4o vision (not just OCR)
263
  """
264
- print(f"\n{'='*70}")
265
- print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
266
- print(f"{'='*70}")
267
 
268
  results = {
269
  'doc_id': doc_id,
@@ -273,14 +251,12 @@ Summary (2-3 sentences maximum):"""
273
  'total_stored': 0
274
  }
275
 
276
- # 1. Analyze images VISUALLY using gpt-4o
277
- print(f"\n🖼️ VISUAL IMAGE ANALYSIS (gpt-4o vision) ({len(images)} total)")
278
- print(f"{'─'*70}")
279
 
280
  image_analyses = self.analyze_images_visually(images)
281
  results['image_visual_analyses'] = image_analyses
282
 
283
- # Store each image analysis in vector store
284
  image_docs = {
285
  'text': ' | '.join([
286
  f"Image {a['image_index']}: {a['visual_analysis']}"
@@ -291,7 +267,7 @@ Summary (2-3 sentences maximum):"""
291
  }
292
 
293
  for analysis in image_analyses:
294
- print(f"Image {analysis['image_index']} (visual analysis)")
295
  print(f" Path: {analysis['image_path']}")
296
  print(f" Analysis: {analysis['visual_analysis'][:100]}...")
297
 
@@ -302,13 +278,11 @@ Summary (2-3 sentences maximum):"""
302
  f"{doc_id}_images_visual"
303
  )
304
  results['total_stored'] += len(image_analyses)
305
- print(f" Stored {len(image_analyses)} image visual analyses")
306
  except Exception as e:
307
- print(f"Error storing image analyses: {e}")
308
 
309
- # 2. Summarize and store text chunks
310
- print(f"\n📝 TEXT CHUNK SUMMARIZATION")
311
- print(f"{'─'*70}")
312
 
313
  text_summaries = self.summarize_text_chunks(text)
314
  results['text_summaries'] = text_summaries
@@ -321,7 +295,7 @@ Summary (2-3 sentences maximum):"""
321
  }
322
 
323
  for summary in text_summaries:
324
- print(f"Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
325
 
326
  if text_summaries:
327
  try:
@@ -330,13 +304,11 @@ Summary (2-3 sentences maximum):"""
330
  f"{doc_id}_text_chunks"
331
  )
332
  results['total_stored'] += len(text_summaries)
333
- print(f" Stored {len(text_summaries)} text chunk summaries")
334
  except Exception as e:
335
- print(f" Error storing text summaries: {e}")
336
 
337
- # 3. Summarize and store tables
338
- print(f"\n📋 TABLE SUMMARIZATION ({len(tables)} total)")
339
- print(f"{'─'*70}")
340
 
341
  table_summaries = self.summarize_tables(tables)
342
  results['table_summaries'] = table_summaries
@@ -349,7 +321,7 @@ Summary (2-3 sentences maximum):"""
349
  }
350
 
351
  for summary in table_summaries:
352
- print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
353
 
354
  if table_summaries:
355
  try:
@@ -358,25 +330,20 @@ Summary (2-3 sentences maximum):"""
358
  f"{doc_id}_tables"
359
  )
360
  results['total_stored'] += len(table_summaries)
361
- print(f" Stored {len(table_summaries)} table summaries")
362
  except Exception as e:
363
- print(f" Error storing table summaries: {e}")
364
-
365
- # 4. Summary statistics
366
- print(f"\n{'='*70}")
367
- print(f"📊 STORAGE SUMMARY")
368
- print(f"{'='*70}")
369
- print(f" Images analyzed visually & stored: {len(image_analyses)}")
370
- print(f" Text chunks summarized & stored: {len(text_summaries)}")
371
- print(f" Tables summarized & stored: {len(table_summaries)}")
372
  print(f" Total items stored in vector: {results['total_stored']}")
373
- print(f"{'='*70}")
374
 
375
  self.visual_summaries_log.append(results)
376
  return results
377
 
378
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
379
- """Split text into overlapping chunks"""
380
  chunks = []
381
  start = 0
382
  while start < len(text):
@@ -386,16 +353,15 @@ Summary (2-3 sentences maximum):"""
386
  return chunks
387
 
388
  def get_visual_summaries_log(self) -> List[Dict]:
389
- """Get all visual analysis logs"""
390
  return self.visual_summaries_log
391
 
392
 
393
  class AnsweringRAG:
394
  """
395
- RAG system that:
396
- 1. Searches vector store for relevant content
397
- 2. ANALYZES search results
398
- 3. Generates intelligent answers based on context
399
  """
400
 
401
  def __init__(self, api_key: str = None, debug: bool = True):
@@ -403,7 +369,7 @@ class AnsweringRAG:
403
  self.debug = debug
404
 
405
  self.llm = ChatOpenAI(
406
- model_name="gpt-4o-mini", # Use gpt-4o for better understanding
407
  api_key=api_key,
408
  temperature=TEMPERATURE,
409
  max_tokens=MAX_TOKENS,
@@ -413,10 +379,10 @@ class AnsweringRAG:
413
  self.answer_log = []
414
 
415
  if self.debug:
416
- print(" AnsweringRAG initialized with answer generation")
417
 
418
  def _debug_print(self, label: str, data: any):
419
- """Print debug information"""
420
  if self.debug:
421
  print(f"\n🔍 DEBUG [{label}]:")
422
  if isinstance(data, (list, dict)):
@@ -431,9 +397,9 @@ class AnsweringRAG:
431
  search_results: List[Dict]
432
  ) -> Dict:
433
  """
434
- Analyze search results and generate intelligent answer
435
 
436
- Returns:
437
  {
438
  'question': user question,
439
  'answer': detailed answer,
@@ -443,22 +409,15 @@ class AnsweringRAG:
443
  }
444
  """
445
 
446
- print(f"\n{'='*70}")
447
  print(f"ANALYZING QUESTION & GENERATING ANSWER")
448
- print(f"{'='*70}")
449
 
450
- print(f"\n Question: {question}")
451
- print(f"📊 Search Results Found: {len(search_results)}")
452
 
453
- # Check if we have search results
454
  if not search_results:
455
- print(f"⚠️ No search results found!")
456
- answer = f"""I could not find relevant information in the document to answer your question: "{question}"
457
-
458
- Try:
459
- - Using different keywords
460
- - Breaking the question into smaller parts
461
- - Asking about other topics in the document"""
462
 
463
  result = {
464
  'question': question,
@@ -470,7 +429,6 @@ Try:
470
  self.answer_log.append(result)
471
  return result
472
 
473
- # Build context from search results
474
  context_parts = []
475
  for idx, result in enumerate(search_results, 1):
476
  content = result.get('content', '')
@@ -485,43 +443,39 @@ Try:
485
 
486
  full_context = "\n".join(context_parts)
487
 
488
- self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
489
 
490
- # Build prompt to analyze results and answer question
491
- analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
492
 
493
- USER QUESTION:
494
  "{question}"
495
 
496
- RELEVANT CONTENT FROM DOCUMENT:
497
  {full_context}
498
 
499
- INSTRUCTIONS:
500
- 1. Analyze the provided content carefully
501
- 2. Extract information relevant to the question
502
- 3. Synthesize a clear, comprehensive answer in {self.language}
503
- 4. If the content doesn't fully answer the question, explain what information is available
504
- 5. Be specific and cite the content when relevant
505
- 6. Structure your answer clearly with key points
506
 
507
- ANSWER:"""
508
 
509
- print(f"\n🔍 Analyzing search results...")
510
- print(f" Context size: {len(full_context)} characters")
511
  print(f" Sources: {len(search_results)}")
512
 
513
  try:
514
- # Call LLM to analyze and answer
515
  message = HumanMessage(content=analysis_prompt)
516
  response = self.llm.invoke([message])
517
  answer = response.content.strip()
518
 
519
- # Determine confidence level
520
  confidence = self._estimate_confidence(len(search_results), answer)
521
 
522
- print(f" Answer generated successfully")
523
  print(f" Confidence: {confidence}")
524
- print(f" Answer length: {len(answer)} characters")
525
 
526
  result = {
527
  'question': question,
@@ -535,8 +489,8 @@ ANSWER:"""
535
  return result
536
 
537
  except Exception as e:
538
- print(f" Error generating answer: {e}")
539
- answer = f"I encountered an error while analyzing the search results. Please try again."
540
 
541
  result = {
542
  'question': question,
@@ -551,66 +505,14 @@ ANSWER:"""
551
  return result
552
 
553
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
554
- """Estimate confidence level of answer"""
555
  answer_length = len(answer)
556
 
557
- # High confidence: multiple sources, substantial answer
558
  if sources_count >= 3 and answer_length > 500:
559
  return "high"
560
 
561
- # Medium confidence: some sources, decent answer
562
  elif sources_count >= 2 and answer_length > 200:
563
  return "medium"
564
 
565
- # Low confidence: few sources or short answer
566
  else:
567
  return "low"
568
-
569
- def get_answer_with_sources(
570
- self,
571
- question: str,
572
- search_results: List[Dict]
573
- ) -> Dict:
574
- """
575
- Get answer AND properly formatted sources
576
- Returns both answer and formatted source citations
577
- """
578
-
579
- result = self.analyze_and_answer(question, search_results)
580
-
581
- # Format sources for display
582
- formatted_sources = []
583
- for idx, source in enumerate(result['search_results'], 1):
584
- formatted_sources.append({
585
- 'index': idx,
586
- 'type': source.get('type', 'unknown'),
587
- 'content': source.get('content', ''),
588
- 'relevance': 1 - source.get('distance', 0) if source.get('distance') else 0
589
- })
590
-
591
- result['formatted_sources'] = formatted_sources
592
- return result
593
-
594
- def get_answer_log(self) -> List[Dict]:
595
- """Get all answer generation logs"""
596
- return self.answer_log
597
-
598
- def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
599
- """Pretty print answer with sources"""
600
-
601
- print(f"\n{'='*70}")
602
- print(f"ANSWER TO: {result['question']}")
603
- print(f"{'='*70}")
604
-
605
- print(f"\n📝 ANSWER (Confidence: {result['confidence'].upper()}):")
606
- print(f"{'-'*70}")
607
- print(result['answer'])
608
- print(f"{'-'*70}")
609
-
610
- if result.get('formatted_sources'):
611
- print(f"\n📚 SOURCES USED ({len(result['formatted_sources'])} total):")
612
- for source in result['formatted_sources']:
613
- print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
614
- print(f"{source['content'][:max_source_length]}...")
615
-
616
- print(f"\n{'='*70}")
 
1
  """
2
+ RAG основной pipeline
 
 
3
  """
4
  from typing import List, Dict
5
  from langchain_openai import ChatOpenAI
 
15
 
16
  class VisualMultimodalRAG:
17
  """
18
+ RAG - подготовительный этап:
19
+ 1. Кодирует изображение в base64 и отправляет в gpt-4o-mini
20
+ 2. Получает описание изображения
21
+ 3. Сохраняет описание в векторное хранилище
 
22
  """
23
 
24
  def __init__(self, api_key: str = None, debug: bool = True):
25
  api_key = api_key or OPENAI_API_KEY
26
  self.debug = debug
27
 
 
28
  self.llm = ChatOpenAI(
29
+ model_name=OPENAI_MODEL,
30
  api_key=api_key,
31
  temperature=TEMPERATURE,
32
  max_tokens=MAX_TOKENS,
 
36
  self.visual_summaries_log = []
37
 
38
  if self.debug:
39
+ print(f"VisualMultimodalRAG with {OPENAI_MODEL}")
40
 
41
  def _debug_print(self, label: str, data: any):
42
+ """Debug"""
43
  if self.debug:
44
+ print(f"\nDEBUG [{label}]:")
45
  if isinstance(data, (list, dict)):
46
  print(f" Type: {type(data).__name__}")
47
  print(f" Content: {str(data)[:300]}...")
 
49
  print(f" {data}")
50
 
51
  def _image_to_base64(self, image_path: str) -> str:
52
+ """Конвертирует изображение в base64"""
53
  try:
54
  with open(image_path, 'rb') as image_file:
55
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
 
60
 
61
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
62
  """
63
+ Отправляет в модель изображение для суммаризации
 
 
 
 
 
 
 
 
 
64
  """
65
  if not os.path.exists(image_path):
66
  return f"[Image {image_idx}: File not found - {image_path}]"
67
 
68
  try:
 
69
  image_base64 = self._image_to_base64(image_path)
70
  if not image_base64:
71
+ return f"[Image {image_idx}: Error converting to base64]"
72
 
 
73
  file_ext = Path(image_path).suffix.lower()
74
  media_type_map = {
75
  '.jpg': 'image/jpeg',
 
80
  }
81
  media_type = media_type_map.get(file_ext, 'image/png')
82
 
83
+ print(f" Analyzing image {image_idx}...")
84
 
 
85
  message = HumanMessage(
86
  content=[
87
  {
 
92
  },
93
  {
94
  "type": "text",
95
+ "text": f"""Ты - ассистент по сбору и обобщению информации. Проанализируй изображение.
96
 
97
+ По результатам анализа предоставь информацию:
98
+ 1. Что изображено на картинке - основные объекты и элементы
99
+ 2. Тип данных и содержимое - числа, графики, зависимости.
100
+ 3. Назначение изображения - для чего оно представлено и что отображает
101
+ 4. Связь с текстом
 
102
 
103
+ Будь краток и содержателен. Фокусируйся на визуальной информации.
104
 
105
+ Результат:"""
106
  }
107
  ],
108
  )
109
 
 
110
  response = self.llm.invoke([message])
111
  analysis = response.content.strip()
112
 
113
  if self.debug:
114
  self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
115
 
116
+ print(f" Image {image_idx} analyzed successfully")
117
  return analysis
118
 
119
  except Exception as e:
120
  error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
121
+ print(f" Error analyzing image {image_idx}: {e}")
122
  return error_msg
123
 
124
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
125
  """
126
+ Считывает изображения и отправляет на анализ
 
127
  """
128
  visual_analyses = []
129
 
 
131
  image_path = image.get('path', '')
132
 
133
  if not image_path:
134
+ print(f" Image {idx}: No path")
135
  continue
136
 
 
137
  visual_analysis = self.analyze_image_visually(image_path, idx)
138
 
139
  visual_analyses.append({
 
141
  'image_index': idx,
142
  'image_path': image_path,
143
  'visual_analysis': visual_analysis,
144
+ 'ocr_text': image.get('ocr_text', '')
145
  })
146
 
147
  return visual_analyses
148
 
149
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
150
  """
151
+ Отправляет куски текста на суммаризацию
152
  """
153
  chunks = []
154
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
 
160
  continue
161
 
162
  try:
163
+ prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующий кусок текста.
164
+ Выдели основные моменты, факты и идеи. Будь краток.
165
 
166
+ Текст :
167
  {chunk}
168
 
169
+ Результат:"""
170
 
171
  message = HumanMessage(content=prompt)
172
  response = self.llm.invoke([message])
 
190
 
191
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
192
  """
193
+ Отправляет таблицы на суммаризацию
194
  """
195
  summaries = []
196
 
 
201
  continue
202
 
203
  try:
204
+ prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующию таблицу.
205
+ Выдели основные моменты, числа, и значения строк/колонок. Будь краток.
206
 
207
+ Таблица:
208
  {table_content}
209
 
210
+ Результат:"""
211
 
212
  message = HumanMessage(content=prompt)
213
  response = self.llm.invoke([message])
 
238
  doc_id: str
239
  ) -> Dict:
240
  """
241
+ Основной pipeline анализирует и сохраняет документы в хранилище
 
242
  """
243
+
244
+ print(f"PROCESSING ANALYSIS: {doc_id}")
 
245
 
246
  results = {
247
  'doc_id': doc_id,
 
251
  'total_stored': 0
252
  }
253
 
254
+ print(f"\n VISUAL IMAGE ANALYSIS ({len(images)} )")
255
+
 
256
 
257
  image_analyses = self.analyze_images_visually(images)
258
  results['image_visual_analyses'] = image_analyses
259
 
 
260
  image_docs = {
261
  'text': ' | '.join([
262
  f"Image {a['image_index']}: {a['visual_analysis']}"
 
267
  }
268
 
269
  for analysis in image_analyses:
270
+ print(f" Image {analysis['image_index']}")
271
  print(f" Path: {analysis['image_path']}")
272
  print(f" Analysis: {analysis['visual_analysis'][:100]}...")
273
 
 
278
  f"{doc_id}_images_visual"
279
  )
280
  results['total_stored'] += len(image_analyses)
281
+ print(f" Stored {len(image_analyses)} imagу analyses")
282
  except Exception as e:
283
+ print(f"Error storing image analyses: {e}")
284
 
285
+ print(f"\n TEXT CHUNK SUMMARIZATION")
 
 
286
 
287
  text_summaries = self.summarize_text_chunks(text)
288
  results['text_summaries'] = text_summaries
 
295
  }
296
 
297
  for summary in text_summaries:
298
+ print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
299
 
300
  if text_summaries:
301
  try:
 
304
  f"{doc_id}_text_chunks"
305
  )
306
  results['total_stored'] += len(text_summaries)
307
+ print(f" Stored {len(text_summaries)} text chunk summaries")
308
  except Exception as e:
309
+ print(f" Error text summaries: {e}")
310
 
311
+ print(f"\n TABLE SUMMARIZATION ({len(tables)}")
 
 
312
 
313
  table_summaries = self.summarize_tables(tables)
314
  results['table_summaries'] = table_summaries
 
321
  }
322
 
323
  for summary in table_summaries:
324
+ print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
325
 
326
  if table_summaries:
327
  try:
 
330
  f"{doc_id}_tables"
331
  )
332
  results['total_stored'] += len(table_summaries)
333
+ print(f" Stored {len(table_summaries)} table summaries")
334
  except Exception as e:
335
+ print(f" Error storing table summaries: {e}")
336
+
337
+ print(f" STORAGE SUMMARY")
338
+ print(f" Images analyzed: {len(image_analyses)}")
339
+ print(f" Text chunks summarized: {len(text_summaries)}")
340
+ print(f" Tables summarized: {len(table_summaries)}")
 
 
 
341
  print(f" Total items stored in vector: {results['total_stored']}")
 
342
 
343
  self.visual_summaries_log.append(results)
344
  return results
345
 
346
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
 
347
  chunks = []
348
  start = 0
349
  while start < len(text):
 
353
  return chunks
354
 
355
  def get_visual_summaries_log(self) -> List[Dict]:
 
356
  return self.visual_summaries_log
357
 
358
 
359
  class AnsweringRAG:
360
  """
361
+ RAG - работа с ответом на запрос:
362
+ 1. Поиск в векторном хранилище
363
+ 2. Анализ результатов
364
+ 3. Предоставление ответа
365
  """
366
 
367
  def __init__(self, api_key: str = None, debug: bool = True):
 
369
  self.debug = debug
370
 
371
  self.llm = ChatOpenAI(
372
+ model_name=OPENAI_MODEL,
373
  api_key=api_key,
374
  temperature=TEMPERATURE,
375
  max_tokens=MAX_TOKENS,
 
379
  self.answer_log = []
380
 
381
  if self.debug:
382
+ print(" AnsweringRAG initialized")
383
 
384
  def _debug_print(self, label: str, data: any):
385
+ """Debug"""
386
  if self.debug:
387
  print(f"\n🔍 DEBUG [{label}]:")
388
  if isinstance(data, (list, dict)):
 
397
  search_results: List[Dict]
398
  ) -> Dict:
399
  """
400
+ Проанализируй найденные документов и на основе их предоставь ответ на вопрос пользователя
401
 
402
+ Ответ:
403
  {
404
  'question': user question,
405
  'answer': detailed answer,
 
409
  }
410
  """
411
 
 
412
  print(f"ANALYZING QUESTION & GENERATING ANSWER")
 
413
 
414
+ print(f"\n Question: {question}")
415
+ print(f" Search Results: {len(search_results)}")
416
 
 
417
  if not search_results:
418
+ print(f" No search results found!")
419
+ answer = f"""Релевантная информация в документах отсутствует: "{question}"
420
+ """
 
 
 
 
421
 
422
  result = {
423
  'question': question,
 
429
  self.answer_log.append(result)
430
  return result
431
 
 
432
  context_parts = []
433
  for idx, result in enumerate(search_results, 1):
434
  content = result.get('content', '')
 
443
 
444
  full_context = "\n".join(context_parts)
445
 
446
+ self._debug_print("Context Prepared", f"{len(context_parts)} sources")
447
 
448
+ analysis_prompt = f"""Ты - ассистент по анализу документов и ответов на вопросы по ним.
 
449
 
450
+ ВОПРОС:
451
  "{question}"
452
 
453
+ РЕЛЕВАНТНАЯ ИНФОРМАЦИЯ:
454
  {full_context}
455
 
456
+ ИНСТРУКЦИИ:
457
+ 1. Проанализируй предоставленный контент
458
+ 2. Выдели информацию имеющую отношение к вопросу
459
+ 3. Предоставь понятный и исчерпывающий ответ
460
+ 4. Если контент полностью не отвечает на вопрос предосавь информацию которая доступна в контенте
461
+ 5. Построй свой ответ опираясь на ключевые моменты
 
462
 
463
+ Ответ:"""
464
 
465
+ print(f"\n Analyzing search results...")
466
+ print(f" Context size: {len(full_context)} chars")
467
  print(f" Sources: {len(search_results)}")
468
 
469
  try:
 
470
  message = HumanMessage(content=analysis_prompt)
471
  response = self.llm.invoke([message])
472
  answer = response.content.strip()
473
 
 
474
  confidence = self._estimate_confidence(len(search_results), answer)
475
 
476
+ print(f" Answer generated successfully")
477
  print(f" Confidence: {confidence}")
478
+ print(f" Answer length: {len(answer)} chars")
479
 
480
  result = {
481
  'question': question,
 
489
  return result
490
 
491
  except Exception as e:
492
+ print(f" Error generating answer: {e}")
493
+ answer = f"Error while analyzing the search results."
494
 
495
  result = {
496
  'question': question,
 
505
  return result
506
 
507
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
508
+ """Уверенность в ответе на основании найденных источников информации"""
509
  answer_length = len(answer)
510
 
 
511
  if sources_count >= 3 and answer_length > 500:
512
  return "high"
513
 
 
514
  elif sources_count >= 2 and answer_length > 200:
515
  return "medium"
516
 
 
517
  else:
518
  return "low"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/vector_store.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
- Vector Store and Embeddings Module using ChromaDB with sentence-transformers
3
- UPDATED for ChromaDB v0.4.22+ (auto-persist, no manual persist needed)
4
  """
5
  import os
6
  import json
@@ -12,14 +11,14 @@ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
12
 
13
 
14
  class CLIPEmbedder:
15
- """Custom embedder using sentence-transformers for multimodal content"""
16
  def __init__(self, model_name: str = EMBEDDING_MODEL):
17
- print(f"🔄 Loading embedding model: {model_name}")
18
  self.model = SentenceTransformer(model_name)
19
- print(f"Model loaded successfully")
20
 
21
  def embed(self, text: str) -> List[float]:
22
- """Generate embedding for text"""
23
  try:
24
  embedding = self.model.encode(text, convert_to_numpy=False)
25
  return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
@@ -28,7 +27,7 @@ class CLIPEmbedder:
28
  return [0.0] * EMBEDDING_DIM
29
 
30
  def embed_batch(self, texts: List[str]) -> List[List[float]]:
31
- """Generate embeddings for batch of texts"""
32
  try:
33
  embeddings = self.model.encode(texts, convert_to_numpy=False)
34
  return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
@@ -38,34 +37,31 @@ class CLIPEmbedder:
38
 
39
 
40
  class VectorStore:
41
- """Vector store manager using ChromaDB (v0.4.22+ with auto-persist)"""
42
  def __init__(self):
43
  self.persist_directory = CHROMA_DB_PATH
44
  self.embedder = CLIPEmbedder()
45
 
46
- print(f"\n🔄 Initializing ChromaDB at: {self.persist_directory}")
47
 
48
- # NEW ChromaDB v0.4.22+ - PersistentClient auto-persists
49
  try:
50
  self.client = chromadb.PersistentClient(
51
  path=self.persist_directory
52
  )
53
- print(f"ChromaDB PersistentClient initialized")
54
  except Exception as e:
55
- print(f"Error initializing ChromaDB: {e}")
56
- print(f"Trying fallback initialization...")
57
  self.client = chromadb.PersistentClient(
58
  path=self.persist_directory
59
  )
60
 
61
- # Get or create collection
62
  try:
63
- self.collection = self.client.get_or_create_collection(
64
  name="multimodal_rag",
65
  metadata={"hnsw:space": "cosine"}
66
  )
67
  count = self.collection.count()
68
- print(f"Collection loaded: {count} items in store")
69
  except Exception as e:
70
  print(f"Error with collection: {e}")
71
  self.collection = self.client.get_or_create_collection(
@@ -73,14 +69,13 @@ class VectorStore:
73
  )
74
 
75
  def add_documents(self, documents: List[Dict], doc_id: str):
76
- """Add documents to vector store"""
77
  texts = []
78
  metadatas = []
79
  ids = []
80
 
81
- print(f"\n📚 Adding documents for: {doc_id}")
82
 
83
- # Add text chunks
84
  if 'text' in documents and documents['text']:
85
  chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
86
  for idx, chunk in enumerate(chunks):
@@ -91,9 +86,8 @@ class VectorStore:
91
  'chunk_idx': str(idx)
92
  })
93
  ids.append(f"{doc_id}_text_{idx}")
94
- print(f" Text: {len(chunks)} chunks")
95
 
96
- # Add image descriptions and OCR text
97
  if 'images' in documents:
98
  image_count = 0
99
  for idx, image_data in enumerate(documents['images']):
@@ -108,9 +102,8 @@ class VectorStore:
108
  ids.append(f"{doc_id}_image_{idx}")
109
  image_count += 1
110
  if image_count > 0:
111
- print(f" Images: {image_count} with OCR text")
112
 
113
- # Add table content
114
  if 'tables' in documents:
115
  table_count = 0
116
  for idx, table_data in enumerate(documents['tables']):
@@ -124,14 +117,12 @@ class VectorStore:
124
  ids.append(f"{doc_id}_table_{idx}")
125
  table_count += 1
126
  if table_count > 0:
127
- print(f" Tables: {table_count}")
128
 
129
  if texts:
130
- # Generate embeddings
131
  print(f" 🔄 Generating {len(texts)} embeddings...")
132
  embeddings = self.embedder.embed_batch(texts)
133
 
134
- # Add to collection
135
  try:
136
  self.collection.add(
137
  ids=ids,
@@ -139,14 +130,12 @@ class VectorStore:
139
  embeddings=embeddings,
140
  metadatas=metadatas
141
  )
142
- print(f"Successfully added {len(texts)} items to vector store")
143
- # Auto-persist happens here
144
- print(f"✅ Data persisted automatically to: {self.persist_directory}")
145
  except Exception as e:
146
- print(f"Error adding to collection: {e}")
147
 
148
  def search(self, query: str, n_results: int = 5) -> List[Dict]:
149
- """Search vector store for similar documents"""
150
  try:
151
  query_embedding = self.embedder.embed(query)
152
 
@@ -155,7 +144,6 @@ class VectorStore:
155
  n_results=n_results
156
  )
157
 
158
- # Format results
159
  formatted_results = []
160
  if results['documents']:
161
  for i, doc in enumerate(results['documents'][0]):
@@ -175,7 +163,7 @@ class VectorStore:
175
  return []
176
 
177
  def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
178
- """Split text into chunks with overlap"""
179
  chunks = []
180
  start = 0
181
  while start < len(text):
@@ -185,7 +173,7 @@ class VectorStore:
185
  return chunks
186
 
187
  def get_collection_info(self) -> Dict:
188
- """Get information about the collection"""
189
  try:
190
  count = self.collection.count()
191
  return {
@@ -199,35 +187,23 @@ class VectorStore:
199
  return {'status': 'error', 'message': str(e)}
200
 
201
  def delete_by_doc_id(self, doc_id: str):
202
- """Delete all documents related to a specific doc_id"""
203
  try:
204
- # Get all IDs with this doc_id
205
  results = self.collection.get(where={'doc_id': doc_id})
206
  if results['ids']:
207
  self.collection.delete(ids=results['ids'])
208
- print(f"Deleted {len(results['ids'])} documents for {doc_id}")
209
- # Auto-persist on delete
210
- print(f"✅ Changes persisted automatically")
211
  except Exception as e:
212
  print(f"Error deleting documents: {e}")
213
 
214
- def persist(self):
215
- """
216
- No-op for compatibility with older code.
217
- ChromaDB v0.4.22+ uses PersistentClient which auto-persists.
218
- This method kept for backward compatibility.
219
- """
220
- print("✅ Vector store is using auto-persist (no manual persist needed)")
221
-
222
  def clear_all(self):
223
- """Clear all documents from collection"""
224
  try:
225
- # Delete collection and recreate
226
  self.client.delete_collection(name="multimodal_rag")
227
  self.collection = self.client.get_or_create_collection(
228
  name="multimodal_rag",
229
  metadata={"hnsw:space": "cosine"}
230
  )
231
- print("Collection cleared and reset")
232
  except Exception as e:
233
  print(f"Error clearing collection: {e}")
 
1
  """
2
+ Векторное хранилище и Эмбеддер
 
3
  """
4
  import os
5
  import json
 
11
 
12
 
13
  class CLIPEmbedder:
14
+ """Эмбеддер"""
15
  def __init__(self, model_name: str = EMBEDDING_MODEL):
16
+ print(f"Embedding model: {model_name}")
17
  self.model = SentenceTransformer(model_name)
18
+ print(f"Model loaded successfully")
19
 
20
  def embed(self, text: str) -> List[float]:
21
+ """Эмбеддинг для текста"""
22
  try:
23
  embedding = self.model.encode(text, convert_to_numpy=False)
24
  return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
 
27
  return [0.0] * EMBEDDING_DIM
28
 
29
  def embed_batch(self, texts: List[str]) -> List[List[float]]:
30
+ """Эмбеддинги для батча текстов"""
31
  try:
32
  embeddings = self.model.encode(texts, convert_to_numpy=False)
33
  return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
 
37
 
38
 
39
  class VectorStore:
40
+ """Векторное хранилище"""
41
  def __init__(self):
42
  self.persist_directory = CHROMA_DB_PATH
43
  self.embedder = CLIPEmbedder()
44
 
45
+ print(f"\nInitializing ChromaDB: {self.persist_directory}")
46
 
 
47
  try:
48
  self.client = chromadb.PersistentClient(
49
  path=self.persist_directory
50
  )
51
+ print(f"ChromaDB initialized")
52
  except Exception as e:
53
+ print(f"Error initializing ChromaDB: {e}")
 
54
  self.client = chromadb.PersistentClient(
55
  path=self.persist_directory
56
  )
57
 
 
58
  try:
59
+ self.collection = self.client.get_or_create_collection(
60
  name="multimodal_rag",
61
  metadata={"hnsw:space": "cosine"}
62
  )
63
  count = self.collection.count()
64
+ print(f"Collection loaded: {count} items in store")
65
  except Exception as e:
66
  print(f"Error with collection: {e}")
67
  self.collection = self.client.get_or_create_collection(
 
69
  )
70
 
71
  def add_documents(self, documents: List[Dict], doc_id: str):
72
+ """Добавление документов в векторное хранилище"""
73
  texts = []
74
  metadatas = []
75
  ids = []
76
 
77
+ print(f"\nAdding document: {doc_id}")
78
 
 
79
  if 'text' in documents and documents['text']:
80
  chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
81
  for idx, chunk in enumerate(chunks):
 
86
  'chunk_idx': str(idx)
87
  })
88
  ids.append(f"{doc_id}_text_{idx}")
89
+ print(f" Text: {len(chunks)} chunks")
90
 
 
91
  if 'images' in documents:
92
  image_count = 0
93
  for idx, image_data in enumerate(documents['images']):
 
102
  ids.append(f"{doc_id}_image_{idx}")
103
  image_count += 1
104
  if image_count > 0:
105
+ print(f" Images: {image_count} with OCR text")
106
 
 
107
  if 'tables' in documents:
108
  table_count = 0
109
  for idx, table_data in enumerate(documents['tables']):
 
117
  ids.append(f"{doc_id}_table_{idx}")
118
  table_count += 1
119
  if table_count > 0:
120
+ print(f" Tables: {table_count}")
121
 
122
  if texts:
 
123
  print(f" 🔄 Generating {len(texts)} embeddings...")
124
  embeddings = self.embedder.embed_batch(texts)
125
 
 
126
  try:
127
  self.collection.add(
128
  ids=ids,
 
130
  embeddings=embeddings,
131
  metadatas=metadatas
132
  )
133
+ print(f"Successfully added {len(texts)} items to vector store")
 
 
134
  except Exception as e:
135
+ print(f"Error adding to collection: {e}")
136
 
137
  def search(self, query: str, n_results: int = 5) -> List[Dict]:
138
+ """Поиск в векторном хранилище"""
139
  try:
140
  query_embedding = self.embedder.embed(query)
141
 
 
144
  n_results=n_results
145
  )
146
 
 
147
  formatted_results = []
148
  if results['documents']:
149
  for i, doc in enumerate(results['documents'][0]):
 
163
  return []
164
 
165
  def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
166
+ """Сплит текста"""
167
  chunks = []
168
  start = 0
169
  while start < len(text):
 
173
  return chunks
174
 
175
  def get_collection_info(self) -> Dict:
176
+ """Получение информации о коллекции в векторном хранилище"""
177
  try:
178
  count = self.collection.count()
179
  return {
 
187
  return {'status': 'error', 'message': str(e)}
188
 
189
  def delete_by_doc_id(self, doc_id: str):
190
+ """Удаление документа из векторного хранилища"""
191
  try:
 
192
  results = self.collection.get(where={'doc_id': doc_id})
193
  if results['ids']:
194
  self.collection.delete(ids=results['ids'])
195
+ print(f"Deleted {len(results['ids'])} documents for {doc_id}")
 
 
196
  except Exception as e:
197
  print(f"Error deleting documents: {e}")
198
 
 
 
 
 
 
 
 
 
199
  def clear_all(self):
200
+ """Очистка хранилища"""
201
  try:
 
202
  self.client.delete_collection(name="multimodal_rag")
203
  self.collection = self.client.get_or_create_collection(
204
  name="multimodal_rag",
205
  metadata={"hnsw:space": "cosine"}
206
  )
207
+ print("Collection cleared")
208
  except Exception as e:
209
  print(f"Error clearing collection: {e}")