dnj0 commited on
Commit
34bfedc
·
1 Parent(s): 00cf41b
Files changed (5) hide show
  1. src/app.py +82 -107
  2. src/config.py +11 -7
  3. src/pdf_parser.py +24 -70
  4. src/rag_system.py +106 -239
  5. src/vector_store.py +49 -170
src/app.py CHANGED
@@ -1,33 +1,25 @@
1
- """
2
- UI RAG
3
- """
4
-
5
  import streamlit as st
6
  import os
7
  from pathlib import Path
8
-
9
  from pdf_parser import PDFParser
10
  from vector_store import VectorStore
11
- from rag_system import VisualMultimodalRAG
12
  from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
13
 
14
-
15
-
16
  st.set_page_config(
17
- page_title="Мультимодальная RAG система (PDF parsing)",
 
18
  layout="wide",
19
  initial_sidebar_state="expanded"
20
  )
21
 
22
-
23
-
24
  if 'api_key_set' not in st.session_state:
25
  st.session_state.api_key_set = False
26
 
27
  if 'api_key' not in st.session_state:
28
  st.session_state.api_key = None
29
 
30
- if 'visual_rag_system' not in st.session_state:
31
  st.session_state.visual_rag_system = None
32
 
33
  if 'vector_store' not in st.session_state:
@@ -54,22 +46,20 @@ if 'processing_results' not in st.session_state:
54
  if 'answering_rag' not in st.session_state:
55
  st.session_state.answering_rag = None
56
 
 
57
 
58
-
59
- st.title("Мультимодальная RAG система (PDF parsing)")
60
  st.markdown("""
61
- Обрабатывает PDF документы и предоставляет информацию по ним
62
- """)
63
-
64
 
 
 
65
 
66
  with st.sidebar:
67
- st.header(" Конфигурация")
68
 
69
- st.subheader(" OpenAI API Ключ")
70
 
71
  api_key = st.text_input(
72
- "Введите OpenAI API ключ:",
73
  type="password",
74
  key="api_key_input"
75
  )
@@ -80,70 +70,65 @@ with st.sidebar:
80
 
81
  if st.session_state.visual_rag_system is None:
82
  try:
83
- st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
84
  st.session_state.vector_store = VectorStore()
85
  st.session_state.parser = PDFParser(debug=True)
86
- st.success("API ключ введен")
87
  except Exception as e:
88
- st.error(f"Ошибка старта системы: {e}")
89
  else:
90
  st.session_state.api_key_set = False
91
- st.warning("Введите OpenAI API ключ")
92
 
93
  st.divider()
94
 
95
  st.subheader("Векторное хранилище")
 
96
  if st.session_state.vector_store:
97
  try:
98
  info = st.session_state.vector_store.get_collection_info()
99
- st.metric("Документов в хранилище", info['count'])
100
- st.caption(f"Расположение: {info['persist_path']}")
101
  except Exception as e:
102
- st.error(f"Ошибка получения информации: {e}")
103
  else:
104
- st.info("Введите OpenAI API ключ")
105
 
106
  st.divider()
107
 
108
- st.subheader("Управление хранилищем")
109
- if st.button("Очистить хранилище"):
 
110
  if st.session_state.vector_store:
111
  try:
112
  st.session_state.vector_store.clear_all()
113
- st.success("Хранилище очищено")
114
  except Exception as e:
115
- st.error(f"Ошибка очистки хранилища: {e}")
116
-
117
 
118
-
119
- st.header("Загрузить PDF")
120
 
121
  uploaded_file = st.file_uploader(
122
- "Выбрать...",
123
  type=['pdf'],
124
- help="Загрузите PDF файл"
125
  )
126
 
127
  if uploaded_file is not None:
128
  upload_path = Path(UPLOAD_FOLDER)
129
  upload_path.mkdir(exist_ok=True)
130
-
131
  file_path = upload_path / uploaded_file.name
132
  with open(file_path, 'wb') as f:
133
  f.write(uploaded_file.getbuffer())
134
-
135
- st.success(f"Файл загружен: {uploaded_file.name}")
136
 
137
  if st.button("Распарсить PDF"):
138
  if not st.session_state.api_key_set:
139
- st.error("Введите OpenAI API ключ")
140
  else:
141
  try:
142
- with st.spinner(" Парсинг PDF..."):
143
-
144
- print(f"Парсинг PDF файла: {uploaded_file.name}")
145
-
146
-
147
  parser = st.session_state.parser
148
  text, images, tables = parser.parse_pdf(str(file_path))
149
 
@@ -154,42 +139,45 @@ if uploaded_file is not None:
154
 
155
  col1, col2, col3 = st.columns(3)
156
  with col1:
157
- st.metric("Текста", f"{len(text):,} chars")
158
  with col2:
159
- st.metric("Изображений", len(images))
160
  with col3:
161
- st.metric("Таблиц", len(tables))
162
 
163
- st.success("Парсинг PDF завершен!")
 
 
 
 
 
 
 
 
164
 
 
165
  except Exception as e:
166
- st.error(f"Парсинг PDF завершелся с ошибкой: {e}")
167
- print(f"Ошибка: {e}")
168
-
169
-
170
 
171
  st.divider()
172
- st.header("Анализ документа")
173
 
174
- st.info("""
175
- Отправляет содержимое документа на анализ
176
- """)
177
 
178
- if st.button("Проанализировать документ"):
 
179
  if not st.session_state.api_key_set:
180
- st.error("Введите OpenAI API ключ")
181
  elif st.session_state.current_text is None:
182
- st.error("Распарсите документ")
183
  else:
184
  try:
185
- with st.spinner("Анализ с gpt-4o-mini..."):
186
-
187
  visual_rag = st.session_state.visual_rag_system
188
  vector_store = st.session_state.vector_store
189
-
190
  results = visual_rag.process_and_store_document(
191
  text=st.session_state.current_text,
192
- images=st.session_state.current_images,
193
  tables=st.session_state.current_tables,
194
  vector_store=vector_store,
195
  doc_id=st.session_state.current_document or "current_doc"
@@ -197,53 +185,45 @@ if st.button("Проанализировать документ"):
197
 
198
  st.session_state.processing_results = results
199
 
200
- st.success("Анализ готов!")
201
 
202
  col1, col2, col3 = st.columns(3)
203
  with col1:
204
  st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
205
  with col2:
206
- st.metric("Проанализировано чанков текста", len(results['text_summaries']))
207
  with col3:
208
  st.metric("Проанализировано та��лиц", len(results['table_summaries']))
209
 
210
- st.metric("Помещено в хранилище", results['total_stored'])
211
-
212
-
213
- print(f"Анализ завершен")
214
 
215
  except Exception as e:
216
- st.error(f"Ошибка в ходе: {e}")
217
- print(f"Ошибка: {e}")
218
-
219
-
220
 
221
  st.divider()
222
- st.header("Работа с документом")
223
 
224
- if 'answering_rag' not in st.session_state:
225
- st.session_state.answering_rag = None
226
 
227
  if st.session_state.api_key_set and st.session_state.answering_rag is None:
228
- from rag_system import AnsweringRAG
229
  st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
230
 
231
  question = st.text_area(
232
- "Введите запрос:",
233
  height=100,
234
- placeholder="О чем данный документ?"
235
  )
236
 
237
- if st.button("Генерация ответа"):
238
  if not st.session_state.api_key_set:
239
- st.error("Введите OpenAI API ключ")
240
  elif st.session_state.current_text is None:
241
- st.error("Распарсите документ")
242
  elif not question:
243
- st.error("Введите запрос")
244
  else:
245
  try:
246
- with st.spinner("Поиск документов..."):
 
247
  store = st.session_state.vector_store
248
 
249
  doc_name = st.session_state.current_document or "current_doc"
@@ -252,55 +232,50 @@ if st.button("Генерация ответа"):
252
  'images': [],
253
  'tables': []
254
  }
 
255
  store.add_documents(doc_data, doc_name)
256
 
257
  search_results = store.search(question, n_results=5)
258
 
259
- print(f"Найдено: {len(search_results)}")
260
-
261
  answering_rag = st.session_state.answering_rag
262
  result = answering_rag.analyze_and_answer(question, search_results)
263
 
264
- st.success("Поиск завершен!")
265
 
266
  st.subheader("Ответ")
267
 
268
  col1, col2, col3 = st.columns(3)
269
  with col1:
270
- confidence_color = {
271
- 'high': '🟢',
272
- 'medium': '🟡',
273
- 'low': '🔴'
274
- }.get(result['confidence'], '⚪')
275
- st.metric("Уверенность в ответе", f"{confidence_color} {result['confidence'].upper()}")
 
276
  with col2:
277
  st.metric("Использовано источников", result['sources_used'])
278
  with col3:
279
  if result['sources_used'] > 0:
280
- st.metric("Среднняя релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
281
 
282
  st.write(result['answer'])
283
 
284
  if st.checkbox("Показать исходные документы"):
285
- st.subheader("Использованы документы")
286
- for idx, source in enumerate(result['formatted_sources'], 1):
287
  relevance = source['relevance']
288
- relevance_bar = "|" * int(relevance * 10) + "|" * (10 - int(relevance * 10))
289
-
290
  with st.expander(
291
- f"Источник {idx} - {source['type'].upper()} "
292
  f"[{relevance_bar}] {relevance:.0%}"
293
  ):
294
  st.write(source['content'])
295
-
296
- print(f" Ответ готов!")
297
-
298
  except Exception as e:
299
- st.error(f"Ошибка обработки запроса: {e}")
300
- print(f"Ошибка: {e}")
301
 
302
  st.divider()
303
 
304
  st.caption(
305
- "Мультимодальная RAG система для парсинга PDF документов"
306
  )
 
 
 
 
 
1
  import streamlit as st
2
  import os
3
  from pathlib import Path
 
4
  from pdf_parser import PDFParser
5
  from vector_store import VectorStore
6
+ from rag_system import VisualMultimodalRAG, AnsweringRAG
7
  from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
8
 
 
 
9
  st.set_page_config(
10
+ page_title="Мультимодальная система RAG LLM",
11
+ page_icon="",
12
  layout="wide",
13
  initial_sidebar_state="expanded"
14
  )
15
 
 
 
16
  if 'api_key_set' not in st.session_state:
17
  st.session_state.api_key_set = False
18
 
19
  if 'api_key' not in st.session_state:
20
  st.session_state.api_key = None
21
 
22
+ if 'visual_rag_system' not in st.session_state:
23
  st.session_state.visual_rag_system = None
24
 
25
  if 'vector_store' not in st.session_state:
 
46
  if 'answering_rag' not in st.session_state:
47
  st.session_state.answering_rag = None
48
 
49
+ st.title("Мультимодальная система RAG LLM")
50
 
 
 
51
  st.markdown("""
 
 
 
52
 
53
+ Обработка PDF-документов с анализом визуального контента
54
+ """)
55
 
56
  with st.sidebar:
57
+ st.header("Конфигурация")
58
 
59
+ st.subheader("Ключ API OpenAI")
60
 
61
  api_key = st.text_input(
62
+ "Введите ваш ключ API OpenAI:",
63
  type="password",
64
  key="api_key_input"
65
  )
 
70
 
71
  if st.session_state.visual_rag_system is None:
72
  try:
73
+ st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True)
74
  st.session_state.vector_store = VectorStore()
75
  st.session_state.parser = PDFParser(debug=True)
76
+ st.success("Ключ API установлен")
77
  except Exception as e:
78
+ st.error(f"Ошибка при инициализации систем: {e}")
79
  else:
80
  st.session_state.api_key_set = False
81
+ st.warning("Введите ключ API для продолжения")
82
 
83
  st.divider()
84
 
85
  st.subheader("Векторное хранилище")
86
+
87
  if st.session_state.vector_store:
88
  try:
89
  info = st.session_state.vector_store.get_collection_info()
90
+ st.metric("Элементов в хранилище", info['count'])
91
+ st.caption(f"Путь: {info['persist_path']}")
92
  except Exception as e:
93
+ st.error(f"Ошибка получения информации о хранилище: {e}")
94
  else:
95
+ st.info("Установите ключ API для инициализации векторного хранилища")
96
 
97
  st.divider()
98
 
99
+ st.subheader("Управление документами")
100
+
101
+ if st.button("Очистить векторное хранилище"):
102
  if st.session_state.vector_store:
103
  try:
104
  st.session_state.vector_store.clear_all()
105
+ st.success("Векторное хранилище очищено")
106
  except Exception as e:
107
+ st.error(f"Ошибка при очистке хранилища: {e}")
 
108
 
109
+ st.header("Загрузка PDF-документа")
 
110
 
111
  uploaded_file = st.file_uploader(
112
+ "Выберите PDF-файл",
113
  type=['pdf'],
114
+ help="PDF с текстом, изображениями и таблицами"
115
  )
116
 
117
  if uploaded_file is not None:
118
  upload_path = Path(UPLOAD_FOLDER)
119
  upload_path.mkdir(exist_ok=True)
 
120
  file_path = upload_path / uploaded_file.name
121
  with open(file_path, 'wb') as f:
122
  f.write(uploaded_file.getbuffer())
123
+ st.success(f"Файл сохранён: {uploaded_file.name}")
 
124
 
125
  if st.button("Распарсить PDF"):
126
  if not st.session_state.api_key_set:
127
+ st.error("Введите ключ API для продолжения")
128
  else:
129
  try:
130
+ with st.spinner("Парсинг PDF..."):
131
+ print("PARSING: " + uploaded_file.name)
 
 
 
132
  parser = st.session_state.parser
133
  text, images, tables = parser.parse_pdf(str(file_path))
134
 
 
139
 
140
  col1, col2, col3 = st.columns(3)
141
  with col1:
142
+ st.metric("Текст", f"{len(text):,} символов")
143
  with col2:
144
+ st.metric("Изображения", len(images))
145
  with col3:
146
+ st.metric("Таблицы", len(tables))
147
 
148
+ # if images:
149
+ # st.subheader("Извлечённые изображения")
150
+ # for idx, img in enumerate(images):
151
+ # ocr_text = img.get('ocr_text', '')
152
+ # ocr_len = len(ocr_text)
153
+ # if ocr_len > 0:
154
+ # st.success(f"Изображение {idx}: {ocr_len} символов (OCR)")
155
+ # else:
156
+ # st.warning(f"Изображение {idx}: Текст OCR не найден (будет использоваться визуальный анализ)")
157
 
158
+ st.success("Парсинг PDF завершён!")
159
  except Exception as e:
160
+ st.error(f"Ошибка при парсинге PDF: {e}")
 
 
 
161
 
162
  st.divider()
 
163
 
164
+ st.header("Анализ")
 
 
165
 
166
+
167
+ if st.button("Анализировать"):
168
  if not st.session_state.api_key_set:
169
+ st.error("Введите ключ API для продолжения")
170
  elif st.session_state.current_text is None:
171
+ st.error("Распарсьте PDF-документ")
172
  else:
173
  try:
174
+ with st.spinner("Анализ изображений с помощью gpt-4o-mini..."):
175
+ print("ANALYSIS")
176
  visual_rag = st.session_state.visual_rag_system
177
  vector_store = st.session_state.vector_store
 
178
  results = visual_rag.process_and_store_document(
179
  text=st.session_state.current_text,
180
+ images=st.session_state.current_images,
181
  tables=st.session_state.current_tables,
182
  vector_store=vector_store,
183
  doc_id=st.session_state.current_document or "current_doc"
 
185
 
186
  st.session_state.processing_results = results
187
 
188
+ st.success("Анализ завершён и сохранён!")
189
 
190
  col1, col2, col3 = st.columns(3)
191
  with col1:
192
  st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
193
  with col2:
194
+ st.metric("Фрагментов текста", len(results['text_summaries']))
195
  with col3:
196
  st.metric("Проанализировано та��лиц", len(results['table_summaries']))
197
 
198
+ st.metric("Всего сохранено в вектор", results['total_stored'])
 
 
 
199
 
200
  except Exception as e:
201
+ st.error(f"Ошибка при анализе: {e}")
 
 
 
202
 
203
  st.divider()
 
204
 
205
+ st.header("Задать вопрос о документе")
 
206
 
207
  if st.session_state.api_key_set and st.session_state.answering_rag is None:
 
208
  st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
209
 
210
  question = st.text_area(
211
+ "Введите ваш вопрос:",
212
  height=100,
213
+ placeholder="О чем говорится в документе?"
214
  )
215
 
216
+ if st.button("Поиск и генерация ответа"):
217
  if not st.session_state.api_key_set:
218
+ st.error("Введите ключ API для продолжения")
219
  elif st.session_state.current_text is None:
220
+ st.error("Распарсьте PDF-документ")
221
  elif not question:
222
+ st.error("Введите вопрос")
223
  else:
224
  try:
225
+ with st.spinner("Поиск в документе и анализ..."):
226
+ print("QUESTION: " + question)
227
  store = st.session_state.vector_store
228
 
229
  doc_name = st.session_state.current_document or "current_doc"
 
232
  'images': [],
233
  'tables': []
234
  }
235
+
236
  store.add_documents(doc_data, doc_name)
237
 
238
  search_results = store.search(question, n_results=5)
239
 
 
 
240
  answering_rag = st.session_state.answering_rag
241
  result = answering_rag.analyze_and_answer(question, search_results)
242
 
243
+ st.success("Анализ завершён!")
244
 
245
  st.subheader("Ответ")
246
 
247
  col1, col2, col3 = st.columns(3)
248
  with col1:
249
+ confidence_map = {
250
+ 'high': 'ВЫСОКАЯ',
251
+ 'medium': 'СРЕДНЯЯ',
252
+ 'low': 'НИЗКАЯ'
253
+ }
254
+ confidence_text = confidence_map.get(result['confidence'], result['confidence'].upper())
255
+ st.metric("Уверенность", confidence_text)
256
  with col2:
257
  st.metric("Использовано источников", result['sources_used'])
258
  with col3:
259
  if result['sources_used'] > 0:
260
+ st.metric("Сред. релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
261
 
262
  st.write(result['answer'])
263
 
264
  if st.checkbox("Показать исходные документы"):
265
+ st.subheader("Источники, использованные в ответе")
266
+ for source in result.get('formatted_sources', []):
267
  relevance = source['relevance']
268
+ relevance_bar = "" * int(relevance * 10) + "" * (10 - int(relevance * 10))
 
269
  with st.expander(
270
+ f"Источник {source['index']} - {source['type'].upper()} "
271
  f"[{relevance_bar}] {relevance:.0%}"
272
  ):
273
  st.write(source['content'])
 
 
 
274
  except Exception as e:
275
+ st.error(f"Ошибка при обработке вопроса: {e}")
 
276
 
277
  st.divider()
278
 
279
  st.caption(
280
+ "Мультимодальная система RAG"
281
  )
src/config.py CHANGED
@@ -1,34 +1,38 @@
1
- """
2
- Конфигурационный файл
3
- """
4
  import os
5
  from pathlib import Path
6
 
7
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 
8
  OPENAI_MODEL = "gpt-4o-mini"
 
9
  USE_CACHE = True
10
 
11
  CHROMA_DB_PATH = "./chroma_db"
 
12
  DOCSTORE_PATH = "./docstore"
 
13
  PROCESSED_FILES_LOG = "./processed_files.txt"
14
 
15
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
 
16
  EMBEDDING_DIM = 768
17
 
18
  MAX_CHUNK_SIZE = 500
 
19
  CHUNK_OVERLAP = 50
 
20
  TEMPERATURE = 0.3
 
21
  MAX_TOKENS = 500
22
 
23
  LANGUAGE = "russian"
24
 
25
  Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
 
26
  Path(DOCSTORE_PATH).mkdir(exist_ok=True)
27
 
28
  UPLOAD_FOLDER = "./uploaded_pdfs"
 
29
  Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
30
- MAX_PDF_SIZE_MB = 50
31
 
32
- BATCH_SEARCH_RESULTS = 3
33
- CACHE_RESPONSES = True
34
- SUMMARIZE_FIRST = True
 
 
 
 
1
  import os
2
  from pathlib import Path
3
 
4
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
5
+
6
  OPENAI_MODEL = "gpt-4o-mini"
7
+
8
  USE_CACHE = True
9
 
10
  CHROMA_DB_PATH = "./chroma_db"
11
+
12
  DOCSTORE_PATH = "./docstore"
13
+
14
  PROCESSED_FILES_LOG = "./processed_files.txt"
15
 
16
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
17
+
18
  EMBEDDING_DIM = 768
19
 
20
  MAX_CHUNK_SIZE = 500
21
+
22
  CHUNK_OVERLAP = 50
23
+
24
  TEMPERATURE = 0.3
25
+
26
  MAX_TOKENS = 500
27
 
28
  LANGUAGE = "russian"
29
 
30
  Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
31
+
32
  Path(DOCSTORE_PATH).mkdir(exist_ok=True)
33
 
34
  UPLOAD_FOLDER = "./uploaded_pdfs"
35
+
36
  Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
 
37
 
38
+ MAX_PDF_SIZE_MB = 50
 
 
src/pdf_parser.py CHANGED
@@ -1,6 +1,3 @@
1
- """
2
- PDF Парсер
3
- """
4
  import os
5
  import json
6
  import hashlib
@@ -12,34 +9,37 @@ from PIL import Image
12
  import pytesseract
13
  from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
14
 
15
-
16
  class PDFParser:
17
  def __init__(self, debug: bool = True):
18
  self.docstore_path = Path(DOCSTORE_PATH)
19
  self.docstore_path.mkdir(exist_ok=True)
20
  self.processed_files = self._load_processed_files()
21
  self.debug = debug
22
-
23
-
24
  if self.debug:
25
  print("PDFParser initialized")
26
 
 
 
 
 
 
 
 
27
  def _debug_print(self, label: str, data: any):
28
- """Debug"""
29
  if self.debug:
30
- print(f"\n🔍 [PDF Parser] {label}")
31
  if isinstance(data, dict):
32
  for key, val in data.items():
33
- print(f" {key}: {val}")
34
  elif isinstance(data, (list, tuple)):
35
- print(f" Count: {len(data)}")
36
  for i, item in enumerate(data[:3]):
37
- print(f" [{i}]: {str(item)[:100]}")
38
  else:
39
- print(f" {data}")
40
 
41
  def _load_processed_files(self) -> Dict[str, str]:
42
- """Подгрузка обработанных файлов"""
43
  if os.path.exists(PROCESSED_FILES_LOG):
44
  try:
45
  with open(PROCESSED_FILES_LOG, 'r') as f:
@@ -49,12 +49,10 @@ class PDFParser:
49
  return {}
50
 
51
  def _save_processed_files(self):
52
- """Сохранение обработанных файлов"""
53
  with open(PROCESSED_FILES_LOG, 'w') as f:
54
  json.dump(self.processed_files, f, indent=2)
55
 
56
  def _get_file_hash(self, file_path: str) -> str:
57
- """Проверка изменения файлов"""
58
  hash_md5 = hashlib.md5()
59
  with open(file_path, "rb") as f:
60
  for chunk in iter(lambda: f.read(4096), b""):
@@ -62,56 +60,43 @@ class PDFParser:
62
  return hash_md5.hexdigest()
63
 
64
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
65
- """Извлечение текста из PDF"""
66
  text = ""
67
  try:
68
  with open(pdf_path, 'rb') as file:
69
  reader = PyPDF2.PdfReader(file)
70
  page_count = len(reader.pages)
71
  self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
72
-
73
  for page_num, page in enumerate(reader.pages):
74
  page_text = page.extract_text()
75
  text += page_text + "\n"
76
  self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
77
  except Exception as e:
78
  self._debug_print("ERROR extracting text", str(e))
79
-
80
  self._debug_print("Total Text Extracted", len(text))
81
  return text
82
 
83
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
84
- """Извлечение изображений из PDF"""
85
  images_data = []
86
  try:
87
- self._debug_print("Image extraction", f"File: {pdf_path}")
88
-
89
  images = convert_from_path(pdf_path, dpi=150)
90
- self._debug_print(f"Total images: {len(images)}")
91
-
92
  for idx, image in enumerate(images):
93
- self._debug_print(f"Image {idx}", f"Size: {image.size}")
94
-
95
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
96
  image.save(image_path)
97
  self._debug_print(f"Image {idx} Saved", str(image_path))
98
-
99
- self._debug_print(f"Image {idx} OCR", "Running OCR...")
100
-
101
  try:
102
  ocr_text = pytesseract.image_to_string(image, lang='rus')
103
-
104
  ocr_text = ocr_text.strip()
105
-
106
  if not ocr_text or len(ocr_text) < 5:
107
- self._debug_print(f"Image {idx} OCR Result", f"WARN ({len(ocr_text)} chars)")
108
  else:
109
- self._debug_print(f"Image {idx} OCR Result", f"SUCCESS {len(ocr_text)} chars: {ocr_text[:150]}")
110
-
111
  except Exception as ocr_error:
112
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
113
- ocr_text = f"[Image {idx}: OCR failed {str(ocr_error)}]"
114
-
115
  images_data.append({
116
  'page': idx,
117
  'path': str(image_path),
@@ -120,19 +105,15 @@ class PDFParser:
120
  })
121
  except Exception as e:
122
  self._debug_print("ERROR extracting images", str(e))
123
-
124
  self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
125
  return images_data
126
 
127
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
128
- """Извлечение таблиц из PDF"""
129
  tables_data = []
130
  try:
131
  text = self._extract_text_from_pdf(pdf_path)
132
  lines = text.split('\n')
133
-
134
- self._debug_print("Table extraction", f"Scanning {len(lines)} lines")
135
-
136
  current_table = []
137
  for line in lines:
138
  if '|' in line or '\t' in line:
@@ -144,53 +125,40 @@ class PDFParser:
144
  'description': f"Table {len(tables_data) + 1}"
145
  })
146
  current_table = []
147
-
148
  if current_table and len(current_table) > 1:
149
  tables_data.append({
150
  'content': '\n'.join(current_table),
151
  'description': f"Table {len(tables_data) + 1}"
152
  })
153
-
154
  self._debug_print("Tables Found", len(tables_data))
155
  except Exception as e:
156
  self._debug_print("ERROR extracting tables", str(e))
157
-
158
  return tables_data
159
 
160
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
161
- """Парсинг PDF"""
162
  file_hash = self._get_file_hash(pdf_path)
163
  doc_id = Path(pdf_path).stem
164
-
165
  self._debug_print("PDF Parsing Started", f"File: {doc_id}")
166
-
167
  if doc_id in self.processed_files:
168
  if self.processed_files[doc_id] == file_hash:
169
  self._debug_print("Status", f"File {doc_id} already processed")
170
  return self._load_extracted_data(doc_id)
171
-
172
- print(f"\nProcessing PDF: {doc_id}")
173
-
174
  text = self._extract_text_from_pdf(pdf_path)
175
  images = self._extract_images_from_pdf(pdf_path, doc_id)
176
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
177
-
178
- self._debug_print("Summary", {
179
  'text_length': len(text),
180
  'images_count': len(images),
181
  'tables_count': len(tables),
182
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
183
  })
184
-
185
  self._save_extracted_data(doc_id, text, images, tables)
186
-
187
  self.processed_files[doc_id] = file_hash
188
  self._save_processed_files()
189
-
190
  return text, images, tables
191
 
192
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
193
- """Сохранение извелеченных данных в Docstore"""
194
  data = {
195
  'text': text,
196
  'images': images,
@@ -199,27 +167,13 @@ class PDFParser:
199
  data_path = self.docstore_path / f"{doc_id}_data.json"
200
  with open(data_path, 'w', encoding='utf-8') as f:
201
  json.dump(data, f, ensure_ascii=False, indent=2)
202
-
203
  self._debug_print("Data Saved", str(data_path))
204
 
205
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
206
- """Подгрузка ранее извлеченных данных из Docstore"""
207
  data_path = self.docstore_path / f"{doc_id}_data.json"
208
  try:
209
  with open(data_path, 'r', encoding='utf-8') as f:
210
  data = json.load(f)
211
  return data['text'], data['images'], data['tables']
212
  except:
213
- return "", [], []
214
-
215
- def get_all_documents(self) -> Dict:
216
- """Получение всех документов из Docstore"""
217
- all_docs = {}
218
- for json_file in self.docstore_path.glob("*_data.json"):
219
- doc_id = json_file.stem.replace("_data", "")
220
- try:
221
- with open(json_file, 'r', encoding='utf-8') as f:
222
- all_docs[doc_id] = json.load(f)
223
- except:
224
- pass
225
- return all_docs
 
 
 
 
1
  import os
2
  import json
3
  import hashlib
 
9
  import pytesseract
10
  from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
11
 
 
12
  class PDFParser:
13
  def __init__(self, debug: bool = True):
14
  self.docstore_path = Path(DOCSTORE_PATH)
15
  self.docstore_path.mkdir(exist_ok=True)
16
  self.processed_files = self._load_processed_files()
17
  self.debug = debug
18
+ self._configure_tesseract()
 
19
  if self.debug:
20
  print("PDFParser initialized")
21
 
22
+ def _configure_tesseract(self):
23
+ try:
24
+ pytesseract.get_tesseract_version()
25
+ print("Tesseract configured successfully")
26
+ except Exception as e:
27
+ print(f"Tesseract configuration warning: {e}")
28
+
29
  def _debug_print(self, label: str, data: any):
 
30
  if self.debug:
31
+ print(f"[PDF Parser] {label}")
32
  if isinstance(data, dict):
33
  for key, val in data.items():
34
+ print(f" {key}: {val}")
35
  elif isinstance(data, (list, tuple)):
36
+ print(f" Count: {len(data)}")
37
  for i, item in enumerate(data[:3]):
38
+ print(f" [{i}]: {str(item)[:100]}")
39
  else:
40
+ print(f" {data}")
41
 
42
  def _load_processed_files(self) -> Dict[str, str]:
 
43
  if os.path.exists(PROCESSED_FILES_LOG):
44
  try:
45
  with open(PROCESSED_FILES_LOG, 'r') as f:
 
49
  return {}
50
 
51
  def _save_processed_files(self):
 
52
  with open(PROCESSED_FILES_LOG, 'w') as f:
53
  json.dump(self.processed_files, f, indent=2)
54
 
55
  def _get_file_hash(self, file_path: str) -> str:
 
56
  hash_md5 = hashlib.md5()
57
  with open(file_path, "rb") as f:
58
  for chunk in iter(lambda: f.read(4096), b""):
 
60
  return hash_md5.hexdigest()
61
 
62
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
 
63
  text = ""
64
  try:
65
  with open(pdf_path, 'rb') as file:
66
  reader = PyPDF2.PdfReader(file)
67
  page_count = len(reader.pages)
68
  self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
 
69
  for page_num, page in enumerate(reader.pages):
70
  page_text = page.extract_text()
71
  text += page_text + "\n"
72
  self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
73
  except Exception as e:
74
  self._debug_print("ERROR extracting text", str(e))
 
75
  self._debug_print("Total Text Extracted", len(text))
76
  return text
77
 
78
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
 
79
  images_data = []
80
  try:
81
+ self._debug_print("Image Extraction Started", f"File: {pdf_path}")
 
82
  images = convert_from_path(pdf_path, dpi=150)
83
+ self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
 
84
  for idx, image in enumerate(images):
85
+ self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
 
86
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
87
  image.save(image_path)
88
  self._debug_print(f"Image {idx} Saved", str(image_path))
89
+ self._debug_print(f"Image {idx} OCR")
 
 
90
  try:
91
  ocr_text = pytesseract.image_to_string(image, lang='rus')
 
92
  ocr_text = ocr_text.strip()
 
93
  if not ocr_text or len(ocr_text) < 5:
94
+ self._debug_print(f"Image {idx} OCR Result", f"EMPTY ({len(ocr_text)} chars)")
95
  else:
96
+ self._debug_print(f"Image {idx} OCR Result", f"Success - {len(ocr_text)} chars: {ocr_text[:150]}")
 
97
  except Exception as ocr_error:
98
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
99
+ ocr_text = f"Image {idx}: OCR failed - {str(ocr_error)}"
 
100
  images_data.append({
101
  'page': idx,
102
  'path': str(image_path),
 
105
  })
106
  except Exception as e:
107
  self._debug_print("ERROR extracting images", str(e))
 
108
  self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
109
  return images_data
110
 
111
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
 
112
  tables_data = []
113
  try:
114
  text = self._extract_text_from_pdf(pdf_path)
115
  lines = text.split('\n')
116
+ self._debug_print("Table Detection", f"Scanning {len(lines)} lines")
 
 
117
  current_table = []
118
  for line in lines:
119
  if '|' in line or '\t' in line:
 
125
  'description': f"Table {len(tables_data) + 1}"
126
  })
127
  current_table = []
 
128
  if current_table and len(current_table) > 1:
129
  tables_data.append({
130
  'content': '\n'.join(current_table),
131
  'description': f"Table {len(tables_data) + 1}"
132
  })
 
133
  self._debug_print("Tables Found", len(tables_data))
134
  except Exception as e:
135
  self._debug_print("ERROR extracting tables", str(e))
 
136
  return tables_data
137
 
138
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
 
139
  file_hash = self._get_file_hash(pdf_path)
140
  doc_id = Path(pdf_path).stem
 
141
  self._debug_print("PDF Parsing Started", f"File: {doc_id}")
 
142
  if doc_id in self.processed_files:
143
  if self.processed_files[doc_id] == file_hash:
144
  self._debug_print("Status", f"File {doc_id} already processed")
145
  return self._load_extracted_data(doc_id)
146
+ print(f"Processing PDF: {doc_id}")
 
 
147
  text = self._extract_text_from_pdf(pdf_path)
148
  images = self._extract_images_from_pdf(pdf_path, doc_id)
149
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
150
+ self._debug_print("Extraction Summary", {
 
151
  'text_length': len(text),
152
  'images_count': len(images),
153
  'tables_count': len(tables),
154
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
155
  })
 
156
  self._save_extracted_data(doc_id, text, images, tables)
 
157
  self.processed_files[doc_id] = file_hash
158
  self._save_processed_files()
 
159
  return text, images, tables
160
 
161
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
 
162
  data = {
163
  'text': text,
164
  'images': images,
 
167
  data_path = self.docstore_path / f"{doc_id}_data.json"
168
  with open(data_path, 'w', encoding='utf-8') as f:
169
  json.dump(data, f, ensure_ascii=False, indent=2)
 
170
  self._debug_print("Data Saved", str(data_path))
171
 
172
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
 
173
  data_path = self.docstore_path / f"{doc_id}_data.json"
174
  try:
175
  with open(data_path, 'r', encoding='utf-8') as f:
176
  data = json.load(f)
177
  return data['text'], data['images'], data['tables']
178
  except:
179
+ return "", [], []
 
 
 
 
 
 
 
 
 
 
 
 
src/rag_system.py CHANGED
@@ -1,55 +1,38 @@
1
- """
2
- RAG основной pipeline
3
- """
4
  from typing import List, Dict
5
  from langchain_openai import ChatOpenAI
6
- from langchain_core.messages import HumanMessage, SystemMessage
7
  import base64
8
  import os
9
  from pathlib import Path
10
  from config import (
11
- OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
12
  LANGUAGE, CHROMA_DB_PATH
13
  )
14
 
15
-
16
  class VisualMultimodalRAG:
17
- """
18
- RAG - подготовительный этап:
19
- 1. Кодирует изображение в base64 и отправляет в gpt-4o-mini
20
- 2. Получает описание изображения
21
- 3. Сохраняет описание в векторное хранилище
22
- """
23
-
24
  def __init__(self, api_key: str = None, debug: bool = True):
25
  api_key = api_key or OPENAI_API_KEY
26
  self.debug = debug
27
-
28
  self.llm = ChatOpenAI(
29
- model_name=OPENAI_MODEL,
30
  api_key=api_key,
31
  temperature=TEMPERATURE,
32
  max_tokens=MAX_TOKENS,
33
  )
34
-
35
  self.language = LANGUAGE
36
- self.visual_summaries_log = []
37
-
38
  if self.debug:
39
- print(f"VisualMultimodalRAG with {OPENAI_MODEL}")
40
 
41
  def _debug_print(self, label: str, data: any):
42
- """Debug"""
43
  if self.debug:
44
- print(f"\nDEBUG [{label}]:")
45
  if isinstance(data, (list, dict)):
46
- print(f" Type: {type(data).__name__}")
47
- print(f" Content: {str(data)[:300]}...")
48
  else:
49
- print(f" {data}")
50
 
51
  def _image_to_base64(self, image_path: str) -> str:
52
- """Конвертирует изображение в base64"""
53
  try:
54
  with open(image_path, 'rb') as image_file:
55
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
@@ -59,17 +42,12 @@ class VisualMultimodalRAG:
59
  return None
60
 
61
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
62
- """
63
- Отправляет в модель изображение для суммаризации
64
- """
65
  if not os.path.exists(image_path):
66
- return f"[Image {image_idx}: File not found - {image_path}]"
67
-
68
  try:
69
  image_base64 = self._image_to_base64(image_path)
70
  if not image_base64:
71
- return f"[Image {image_idx}: Error converting to base64]"
72
-
73
  file_ext = Path(image_path).suffix.lower()
74
  media_type_map = {
75
  '.jpg': 'image/jpeg',
@@ -79,9 +57,7 @@ class VisualMultimodalRAG:
79
  '.webp': 'image/webp'
80
  }
81
  media_type = media_type_map.get(file_ext, 'image/png')
82
-
83
- print(f" Analyzing image {image_idx}...")
84
-
85
  message = HumanMessage(
86
  content=[
87
  {
@@ -92,86 +68,62 @@ class VisualMultimodalRAG:
92
  },
93
  {
94
  "type": "text",
95
- "text": f"""Ты - ассистент по сбору и обобщению информации. Проанализируй изображение.
96
-
97
- По результатам анализа предоставь информацию:
98
- 1. Что изображено на картинке - основные объекты и элементы
99
- 2. Тип данных и содержимое - числа, графики, зависимости.
100
- 3. Назначение изображения - для чего оно представлено и что отображает
101
- 4. Связь с текстом
102
-
103
- Будь краток и содержателен. Фокусируйся на визуальной информации.
104
-
105
- Результат:"""
106
  }
107
  ],
108
  )
109
-
110
  response = self.llm.invoke([message])
111
  analysis = response.content.strip()
112
-
113
  if self.debug:
114
  self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
115
-
116
- print(f" Image {image_idx} analyzed successfully")
117
  return analysis
118
-
119
  except Exception as e:
120
- error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
121
- print(f" Error analyzing image {image_idx}: {e}")
122
  return error_msg
123
 
124
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
125
- """
126
- Считывает изображения и отправляет на анализ
127
- """
128
  visual_analyses = []
129
-
130
  for idx, image in enumerate(images):
131
  image_path = image.get('path', '')
132
-
133
  if not image_path:
134
- print(f" Image {idx}: No path")
135
  continue
136
-
137
  visual_analysis = self.analyze_image_visually(image_path, idx)
138
-
139
  visual_analyses.append({
140
  'type': 'image_visual',
141
  'image_index': idx,
142
  'image_path': image_path,
143
  'visual_analysis': visual_analysis,
144
- 'ocr_text': image.get('ocr_text', '')
145
  })
146
-
147
  return visual_analyses
148
 
149
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
150
- """
151
- Отправляет куски текста на суммаризацию
152
- """
153
  chunks = []
154
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
155
-
156
  self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")
157
-
158
  for idx, chunk in enumerate(text_chunks):
159
  if len(chunk.strip()) < 50:
160
  continue
161
-
162
  try:
163
- prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующий кусок текста.
164
- Выдели основные моменты, факты и идеи. Будь краток.
165
-
166
- Текст :
167
  {chunk}
168
-
169
- Результат:"""
170
-
171
  message = HumanMessage(content=prompt)
172
  response = self.llm.invoke([message])
173
  summary = response.content.strip()
174
-
175
  chunks.append({
176
  'type': 'text_chunk',
177
  'chunk_index': len(chunks),
@@ -179,40 +131,27 @@ class VisualMultimodalRAG:
179
  'summary': summary,
180
  'chunk_length': len(chunk)
181
  })
182
-
183
  if self.debug:
184
  self._debug_print(f"Text Chunk {len(chunks)-1} Summary", summary)
185
-
186
  except Exception as e:
187
  print(f"Error summarizing text chunk: {e}")
188
-
189
  return chunks
190
 
191
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
192
- """
193
- Отправляет таблицы на суммаризацию
194
- """
195
  summaries = []
196
-
197
  for idx, table in enumerate(tables):
198
  table_content = table.get('content', '')
199
-
200
  if not table_content or len(table_content.strip()) < 10:
201
  continue
202
-
203
  try:
204
- prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующию таблицу.
205
- Выдели основные моменты, числа, и значения строк/колонок. Будь краток.
206
-
207
- Таблица:
208
  {table_content}
209
-
210
- Результат:"""
211
-
212
  message = HumanMessage(content=prompt)
213
  response = self.llm.invoke([message])
214
  summary = response.content.strip()
215
-
216
  summaries.append({
217
  'type': 'table',
218
  'table_index': idx,
@@ -220,29 +159,21 @@ class VisualMultimodalRAG:
220
  'summary': summary,
221
  'table_length': len(table_content)
222
  })
223
-
224
  if self.debug:
225
  self._debug_print(f"Table {idx} Summary", summary)
226
-
227
  except Exception as e:
228
  print(f"Error summarizing table {idx}: {e}")
229
-
230
  return summaries
231
 
232
  def process_and_store_document(
233
- self,
234
- text: str,
235
  images: List[Dict],
236
  tables: List[Dict],
237
  vector_store,
238
  doc_id: str
239
  ) -> Dict:
240
- """
241
- Основной pipeline анализирует и сохраняет документы в хранилище
242
- """
243
-
244
- print(f"PROCESSING ANALYSIS: {doc_id}")
245
-
246
  results = {
247
  'doc_id': doc_id,
248
  'image_visual_analyses': [],
@@ -250,53 +181,42 @@ class VisualMultimodalRAG:
250
  'table_summaries': [],
251
  'total_stored': 0
252
  }
253
-
254
- print(f"\n VISUAL IMAGE ANALYSIS ({len(images)} )")
255
-
256
-
257
  image_analyses = self.analyze_images_visually(images)
258
  results['image_visual_analyses'] = image_analyses
259
-
260
  image_docs = {
261
  'text': ' | '.join([
262
- f"Image {a['image_index']}: {a['visual_analysis']}"
263
  for a in image_analyses
264
  ]),
265
  'images': [],
266
  'tables': []
267
  }
268
-
269
  for analysis in image_analyses:
270
- print(f" Image {analysis['image_index']}")
271
- print(f" Path: {analysis['image_path']}")
272
- print(f" Analysis: {analysis['visual_analysis'][:100]}...")
273
-
274
  if image_analyses:
275
  try:
276
  vector_store.add_documents(
277
- image_docs,
278
  f"{doc_id}_images_visual"
279
  )
280
  results['total_stored'] += len(image_analyses)
281
- print(f" Stored {len(image_analyses)} imagу analyses")
282
  except Exception as e:
283
  print(f"Error storing image analyses: {e}")
284
-
285
- print(f"\n TEXT CHUNK SUMMARIZATION")
286
-
287
  text_summaries = self.summarize_text_chunks(text)
288
  results['text_summaries'] = text_summaries
289
-
290
  text_docs = {
291
- 'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
292
- for s in text_summaries]),
293
  'images': [],
294
  'tables': []
295
  }
296
-
297
  for summary in text_summaries:
298
- print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
299
-
300
  if text_summaries:
301
  try:
302
  vector_store.add_documents(
@@ -304,25 +224,20 @@ class VisualMultimodalRAG:
304
  f"{doc_id}_text_chunks"
305
  )
306
  results['total_stored'] += len(text_summaries)
307
- print(f" Stored {len(text_summaries)} text chunk summaries")
308
  except Exception as e:
309
- print(f" Error text summaries: {e}")
310
-
311
- print(f"\n TABLE SUMMARIZATION ({len(tables)}")
312
-
313
  table_summaries = self.summarize_tables(tables)
314
  results['table_summaries'] = table_summaries
315
-
316
  table_docs = {
317
- 'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
318
- for s in table_summaries]),
319
  'images': [],
320
  'tables': []
321
  }
322
-
323
  for summary in table_summaries:
324
- print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
325
-
326
  if table_summaries:
327
  try:
328
  vector_store.add_documents(
@@ -330,17 +245,14 @@ class VisualMultimodalRAG:
330
  f"{doc_id}_tables"
331
  )
332
  results['total_stored'] += len(table_summaries)
333
- print(f" Stored {len(table_summaries)} table summaries")
334
  except Exception as e:
335
- print(f" Error storing table summaries: {e}")
336
-
337
- print(f" STORAGE SUMMARY")
338
- print(f" Images analyzed: {len(image_analyses)}")
339
- print(f" Text chunks summarized: {len(text_summaries)}")
340
- print(f" Tables summarized: {len(table_summaries)}")
341
- print(f" Total items stored in vector: {results['total_stored']}")
342
-
343
- self.visual_summaries_log.append(results)
344
  return results
345
 
346
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
@@ -352,73 +264,41 @@ class VisualMultimodalRAG:
352
  start = end - overlap
353
  return chunks
354
 
355
- def get_visual_summaries_log(self) -> List[Dict]:
356
- return self.visual_summaries_log
357
-
358
-
359
  class AnsweringRAG:
360
- """
361
- RAG - работа с ответом на запрос:
362
- 1. Поиск в векторном хранилище
363
- 2. Анализ результатов
364
- 3. Предоставление ответа
365
- """
366
-
367
  def __init__(self, api_key: str = None, debug: bool = True):
368
  api_key = api_key or OPENAI_API_KEY
369
  self.debug = debug
370
-
371
  self.llm = ChatOpenAI(
372
- model_name=OPENAI_MODEL,
373
  api_key=api_key,
374
  temperature=TEMPERATURE,
375
  max_tokens=MAX_TOKENS,
376
  )
377
-
378
  self.language = LANGUAGE
379
- self.answer_log = []
380
-
381
  if self.debug:
382
- print(" AnsweringRAG initialized")
383
 
384
  def _debug_print(self, label: str, data: any):
385
- """Debug"""
386
  if self.debug:
387
- print(f"\n🔍 DEBUG [{label}]:")
388
  if isinstance(data, (list, dict)):
389
- print(f" Type: {type(data).__name__}")
390
- print(f" Content: {str(data)[:300]}...")
391
  else:
392
- print(f" {data}")
393
 
394
  def analyze_and_answer(
395
- self,
396
- question: str,
397
  search_results: List[Dict]
398
  ) -> Dict:
399
- """
400
- Проанализируй найденные документов и на основе их предоставь ответ на вопрос пользователя
401
-
402
- Ответ:
403
- {
404
- 'question': user question,
405
- 'answer': detailed answer,
406
- 'sources_used': number of sources,
407
- 'confidence': low/medium/high,
408
- 'search_results': original search results
409
- }
410
- """
411
-
412
- print(f"ANALYZING QUESTION & GENERATING ANSWER")
413
-
414
- print(f"\n Question: {question}")
415
- print(f" Search Results: {len(search_results)}")
416
-
417
  if not search_results:
418
- print(f" No search results found!")
419
- answer = f"""Релевантная информация в документах отсутствует: "{question}"
420
  """
421
-
422
  result = {
423
  'question': question,
424
  'answer': answer,
@@ -426,72 +306,54 @@ class AnsweringRAG:
426
  'confidence': 'low',
427
  'search_results': []
428
  }
429
- self.answer_log.append(result)
430
  return result
431
-
432
  context_parts = []
433
  for idx, result in enumerate(search_results, 1):
434
  content = result.get('content', '')
435
- metadata = result.get('metadata', {})
436
  content_type = result.get('type', 'unknown')
437
  distance = result.get('distance', 0)
438
  relevance = 1 - distance if distance else 0
439
-
440
  context_parts.append(f"""
441
  [Source {idx} - {content_type.upper()} (relevance: {relevance:.1%})]
442
  {content}""")
443
-
444
  full_context = "\n".join(context_parts)
445
-
446
- self._debug_print("Context Prepared", f"{len(context_parts)} sources")
447
-
448
- analysis_prompt = f"""Ты - ассистент по анализу документов и ответов на вопросы по ним.
449
-
450
- ВОПРОС:
451
  "{question}"
452
-
453
- РЕЛЕВАНТНАЯ ИНФОРМАЦИЯ:
454
  {full_context}
455
-
456
- ИНСТРУКЦИИ:
457
- 1. Проанализируй предоставленный контент
458
- 2. Выдели информацию имеющую отношение к вопросу
459
- 3. Предоставь понятный и исчерпывающий ответ
460
- 4. Если контент полностью не отвечает на вопрос предосавь информацию которая доступна в контенте
461
- 5. Построй свой ответ опираясь на ключевые моменты
462
-
463
- Ответ:"""
464
-
465
- print(f"\n Analyzing search results...")
466
- print(f" Context size: {len(full_context)} chars")
467
- print(f" Sources: {len(search_results)}")
468
-
469
  try:
470
  message = HumanMessage(content=analysis_prompt)
471
  response = self.llm.invoke([message])
472
  answer = response.content.strip()
473
-
474
  confidence = self._estimate_confidence(len(search_results), answer)
475
-
476
- print(f" Answer generated successfully")
477
- print(f" Confidence: {confidence}")
478
- print(f" Answer length: {len(answer)} chars")
479
-
480
  result = {
481
  'question': question,
482
  'answer': answer,
483
  'sources_used': len(search_results),
484
  'confidence': confidence,
485
- 'search_results': search_results
 
486
  }
487
-
488
- self.answer_log.append(result)
489
  return result
490
-
491
  except Exception as e:
492
- print(f" Error generating answer: {e}")
493
- answer = f"Error while analyzing the search results."
494
-
495
  result = {
496
  'question': question,
497
  'answer': answer,
@@ -500,19 +362,24 @@ class AnsweringRAG:
500
  'error': str(e),
501
  'search_results': search_results
502
  }
503
-
504
- self.answer_log.append(result)
505
  return result
506
 
507
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
508
- """Уверенность в ответе на основании найденных источников информации"""
509
  answer_length = len(answer)
510
-
511
  if sources_count >= 3 and answer_length > 500:
512
  return "high"
513
-
514
  elif sources_count >= 2 and answer_length > 200:
515
  return "medium"
516
-
517
  else:
518
  return "low"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from typing import List, Dict
2
  from langchain_openai import ChatOpenAI
3
+ from langchain_core.messages import HumanMessage
4
  import base64
5
  import os
6
  from pathlib import Path
7
  from config import (
8
+ OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
9
  LANGUAGE, CHROMA_DB_PATH
10
  )
11
 
 
12
  class VisualMultimodalRAG:
 
 
 
 
 
 
 
13
  def __init__(self, api_key: str = None, debug: bool = True):
14
  api_key = api_key or OPENAI_API_KEY
15
  self.debug = debug
 
16
  self.llm = ChatOpenAI(
17
+ model_name="gpt-4o-mini",
18
  api_key=api_key,
19
  temperature=TEMPERATURE,
20
  max_tokens=MAX_TOKENS,
21
  )
 
22
  self.language = LANGUAGE
 
 
23
  if self.debug:
24
+ print("VisualMultimodalRAG initialized")
25
 
26
  def _debug_print(self, label: str, data: any):
 
27
  if self.debug:
28
+ print(f"DEBUG [{label}]:")
29
  if isinstance(data, (list, dict)):
30
+ print(f" Type: {type(data).__name__}")
31
+ print(f" Content: {str(data)[:300]}...")
32
  else:
33
+ print(f" {data}")
34
 
35
  def _image_to_base64(self, image_path: str) -> str:
 
36
  try:
37
  with open(image_path, 'rb') as image_file:
38
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
 
42
  return None
43
 
44
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
 
 
 
45
  if not os.path.exists(image_path):
46
+ return f"Image {image_idx}: File not found - {image_path}"
 
47
  try:
48
  image_base64 = self._image_to_base64(image_path)
49
  if not image_base64:
50
+ return f"Image {image_idx}: Could not convert to base64"
 
51
  file_ext = Path(image_path).suffix.lower()
52
  media_type_map = {
53
  '.jpg': 'image/jpeg',
 
57
  '.webp': 'image/webp'
58
  }
59
  media_type = media_type_map.get(file_ext, 'image/png')
60
+ print(f"Analyzing image {image_idx} visually (as {media_type})...")
 
 
61
  message = HumanMessage(
62
  content=[
63
  {
 
68
  },
69
  {
70
  "type": "text",
71
+ "text": f"""You are assistant for analyzing and aggregating information. Analyze this image.
72
+ Provide a visual analysis that includes:
73
+ 1. Main objects and element
74
+ 2. Data/Content - Any numbers, text, charts, graphs
75
+ 3. What this image is showing or representing
76
+ 4. Important patterns, trends, or information
77
+ 5. How image relates to document content
78
+ Be brief and meaningful. Focus on visual information that cannot be extracted from text. Response on {self.language}.
79
+ Analysis:"""
 
 
80
  }
81
  ],
82
  )
 
83
  response = self.llm.invoke([message])
84
  analysis = response.content.strip()
 
85
  if self.debug:
86
  self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
87
+ print(f"Image {image_idx} analyzed successfully")
 
88
  return analysis
 
89
  except Exception as e:
90
+ error_msg = f"Image {image_idx}: Vision analysis failed - {str(e)}"
91
+ print(f"Error analyzing image {image_idx}: {e}")
92
  return error_msg
93
 
94
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
 
 
 
95
  visual_analyses = []
 
96
  for idx, image in enumerate(images):
97
  image_path = image.get('path', '')
 
98
  if not image_path:
99
+ print(f"Image {idx}: No path provided")
100
  continue
 
101
  visual_analysis = self.analyze_image_visually(image_path, idx)
 
102
  visual_analyses.append({
103
  'type': 'image_visual',
104
  'image_index': idx,
105
  'image_path': image_path,
106
  'visual_analysis': visual_analysis,
107
+ 'ocr_text': image.get('ocr_text', '')
108
  })
 
109
  return visual_analyses
110
 
111
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
 
 
 
112
  chunks = []
113
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
 
114
  self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")
 
115
  for idx, chunk in enumerate(text_chunks):
116
  if len(chunk.strip()) < 50:
117
  continue
 
118
  try:
119
+ prompt = f"""Summarize this text chunk in {self.language}.
120
+ Be brief and meaningful. Extract key points, facts, and main ideas.
121
+ Text Chunk:
 
122
  {chunk}
123
+ Summary:"""
 
 
124
  message = HumanMessage(content=prompt)
125
  response = self.llm.invoke([message])
126
  summary = response.content.strip()
 
127
  chunks.append({
128
  'type': 'text_chunk',
129
  'chunk_index': len(chunks),
 
131
  'summary': summary,
132
  'chunk_length': len(chunk)
133
  })
 
134
  if self.debug:
135
  self._debug_print(f"Text Chunk {len(chunks)-1} Summary", summary)
 
136
  except Exception as e:
137
  print(f"Error summarizing text chunk: {e}")
 
138
  return chunks
139
 
140
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
 
 
 
141
  summaries = []
 
142
  for idx, table in enumerate(tables):
143
  table_content = table.get('content', '')
 
144
  if not table_content or len(table_content.strip()) < 10:
145
  continue
 
146
  try:
147
+ prompt = f"""Analyze and summarize this table/structured data in {self.language}.
148
+ Extract key insights, row/column meanings, and important figures. Be brief and meaningful.
149
+ Table Content:
 
150
  {table_content}
151
+ Summary:"""
 
 
152
  message = HumanMessage(content=prompt)
153
  response = self.llm.invoke([message])
154
  summary = response.content.strip()
 
155
  summaries.append({
156
  'type': 'table',
157
  'table_index': idx,
 
159
  'summary': summary,
160
  'table_length': len(table_content)
161
  })
 
162
  if self.debug:
163
  self._debug_print(f"Table {idx} Summary", summary)
 
164
  except Exception as e:
165
  print(f"Error summarizing table {idx}: {e}")
 
166
  return summaries
167
 
168
  def process_and_store_document(
169
+ self,
170
+ text: str,
171
  images: List[Dict],
172
  tables: List[Dict],
173
  vector_store,
174
  doc_id: str
175
  ) -> Dict:
176
+ print("PROCESSING WITH VISUAL IMAGE ANALYSIS: " + doc_id)
 
 
 
 
 
177
  results = {
178
  'doc_id': doc_id,
179
  'image_visual_analyses': [],
 
181
  'table_summaries': [],
182
  'total_stored': 0
183
  }
184
+ print("VISUAL IMAGE ANALYSIS (" + str(len(images)) + " total)")
 
 
 
185
  image_analyses = self.analyze_images_visually(images)
186
  results['image_visual_analyses'] = image_analyses
 
187
  image_docs = {
188
  'text': ' | '.join([
189
+ f"Image {a['image_index']}: {a['visual_analysis']}"
190
  for a in image_analyses
191
  ]),
192
  'images': [],
193
  'tables': []
194
  }
 
195
  for analysis in image_analyses:
196
+ print(f" Image {analysis['image_index']} (visual analysis)")
197
+ print(f" Path: {analysis['image_path']}")
198
+ print(f" Analysis: {analysis['visual_analysis'][:100]}...")
 
199
  if image_analyses:
200
  try:
201
  vector_store.add_documents(
202
+ image_docs,
203
  f"{doc_id}_images_visual"
204
  )
205
  results['total_stored'] += len(image_analyses)
206
+ print(f"Stored {len(image_analyses)} image visual analyses")
207
  except Exception as e:
208
  print(f"Error storing image analyses: {e}")
209
+ print("TEXT CHUNK SUMMARIZATION")
 
 
210
  text_summaries = self.summarize_text_chunks(text)
211
  results['text_summaries'] = text_summaries
 
212
  text_docs = {
213
+ 'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
214
+ for s in text_summaries]),
215
  'images': [],
216
  'tables': []
217
  }
 
218
  for summary in text_summaries:
219
+ print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
 
220
  if text_summaries:
221
  try:
222
  vector_store.add_documents(
 
224
  f"{doc_id}_text_chunks"
225
  )
226
  results['total_stored'] += len(text_summaries)
227
+ print(f"Stored {len(text_summaries)} text chunk summaries")
228
  except Exception as e:
229
+ print(f"Error storing text summaries: {e}")
230
+ print("TABLE SUMMARIZATION (" + str(len(tables)) + " total)")
 
 
231
  table_summaries = self.summarize_tables(tables)
232
  results['table_summaries'] = table_summaries
 
233
  table_docs = {
234
+ 'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
235
+ for s in table_summaries]),
236
  'images': [],
237
  'tables': []
238
  }
 
239
  for summary in table_summaries:
240
+ print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
 
241
  if table_summaries:
242
  try:
243
  vector_store.add_documents(
 
245
  f"{doc_id}_tables"
246
  )
247
  results['total_stored'] += len(table_summaries)
248
+ print(f"Stored {len(table_summaries)} table summaries")
249
  except Exception as e:
250
+ print(f"Error storing table summaries: {e}")
251
+ print("STORAGE SUMMARY")
252
+ print(f" Images analyzed and stored: {len(image_analyses)}")
253
+ print(f" Text chunks summarized and stored: {len(text_summaries)}")
254
+ print(f" Tables summarized and stored: {len(table_summaries)}")
255
+ print(f" Total items stored in vector: {results['total_stored']}")
 
 
 
256
  return results
257
 
258
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
 
264
  start = end - overlap
265
  return chunks
266
 
 
 
 
 
267
  class AnsweringRAG:
 
 
 
 
 
 
 
268
  def __init__(self, api_key: str = None, debug: bool = True):
269
  api_key = api_key or OPENAI_API_KEY
270
  self.debug = debug
 
271
  self.llm = ChatOpenAI(
272
+ model_name="gpt-4o-mini",
273
  api_key=api_key,
274
  temperature=TEMPERATURE,
275
  max_tokens=MAX_TOKENS,
276
  )
 
277
  self.language = LANGUAGE
 
 
278
  if self.debug:
279
+ print("AnsweringRAG initialized")
280
 
281
  def _debug_print(self, label: str, data: any):
 
282
  if self.debug:
283
+ print(f"DEBUG [{label}]:")
284
  if isinstance(data, (list, dict)):
285
+ print(f" Type: {type(data).__name__}")
286
+ print(f" Content: {str(data)[:300]}...")
287
  else:
288
+ print(f" {data}")
289
 
290
  def analyze_and_answer(
291
+ self,
292
+ question: str,
293
  search_results: List[Dict]
294
  ) -> Dict:
295
+ print("ANALYZING QUESTION & GENERATING ANSWER")
296
+ print(f"Question: {question}")
297
+ print(f"Search Results Found: {len(search_results)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  if not search_results:
299
+ print("No search results found!")
300
+ answer = f"""No relevant information in the document to answer question: "{question}"
301
  """
 
302
  result = {
303
  'question': question,
304
  'answer': answer,
 
306
  'confidence': 'low',
307
  'search_results': []
308
  }
 
309
  return result
 
310
  context_parts = []
311
  for idx, result in enumerate(search_results, 1):
312
  content = result.get('content', '')
 
313
  content_type = result.get('type', 'unknown')
314
  distance = result.get('distance', 0)
315
  relevance = 1 - distance if distance else 0
 
316
  context_parts.append(f"""
317
  [Source {idx} - {content_type.upper()} (relevance: {relevance:.1%})]
318
  {content}""")
 
319
  full_context = "\n".join(context_parts)
320
+ self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
321
+ analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
322
+ USER QUESTION:
 
 
 
323
  "{question}"
324
+ RELEVANT CONTENT FROM DOCUMENT:
 
325
  {full_context}
326
+ INSTRUCTIONS:
327
+ 1. Analyze the provided content carefully
328
+ 2. Extract information relevant to the question
329
+ 3. Synthesize a clear, comprehensive answer in {self.language}
330
+ 4. If the content doesn't fully answer the question, explain what information is available
331
+ 5. Be specific and cite the content when relevant
332
+ 6. Structure your answer clearly with key points
333
+ ANSWER:"""
334
+ print("Analyzing search results...")
335
+ print(f" Context size: {len(full_context)} characters")
336
+ print(f" Sources: {len(search_results)}")
 
 
 
337
  try:
338
  message = HumanMessage(content=analysis_prompt)
339
  response = self.llm.invoke([message])
340
  answer = response.content.strip()
 
341
  confidence = self._estimate_confidence(len(search_results), answer)
342
+ print("Answer generated successfully")
343
+ print(f" Confidence: {confidence}")
344
+ print(f" Answer length: {len(answer)} characters")
 
 
345
  result = {
346
  'question': question,
347
  'answer': answer,
348
  'sources_used': len(search_results),
349
  'confidence': confidence,
350
+ 'search_results': search_results,
351
+ 'formatted_sources': self._format_sources(search_results)
352
  }
 
 
353
  return result
 
354
  except Exception as e:
355
+ print(f"Error generating answer: {e}")
356
+ answer = "Error while analyzing the search results"
 
357
  result = {
358
  'question': question,
359
  'answer': answer,
 
362
  'error': str(e),
363
  'search_results': search_results
364
  }
 
 
365
  return result
366
 
367
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
 
368
  answer_length = len(answer)
 
369
  if sources_count >= 3 and answer_length > 500:
370
  return "high"
 
371
  elif sources_count >= 2 and answer_length > 200:
372
  return "medium"
 
373
  else:
374
  return "low"
375
+
376
+ def _format_sources(self, search_results: List[Dict]) -> List[Dict]:
377
+ formatted_sources = []
378
+ for idx, source in enumerate(search_results, 1):
379
+ formatted_sources.append({
380
+ 'index': idx,
381
+ 'type': source.get('type', 'unknown'),
382
+ 'content': source.get('content', ''),
383
+ 'relevance': 1 - source.get('distance', 0) if source.get('distance') else 0
384
+ })
385
+ return formatted_sources
src/vector_store.py CHANGED
@@ -1,209 +1,88 @@
1
- """
2
- Векторное хранилище и Эмбеддер"
3
- """
4
  import os
5
- import json
6
  from typing import List, Dict
 
7
  import chromadb
8
- from sentence_transformers import SentenceTransformer
9
- import numpy as np
10
- from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
11
-
12
-
13
- class CLIPEmbedder:
14
- """Эмбеддер"""
15
- def __init__(self, model_name: str = EMBEDDING_MODEL):
16
- print(f"Embedding model: {model_name}")
17
- self.model = SentenceTransformer(model_name)
18
- print(f"Model loaded successfully")
19
-
20
- def embed(self, text: str) -> List[float]:
21
- """Эмбеддинг для текста"""
22
- try:
23
- embedding = self.model.encode(text, convert_to_numpy=False)
24
- return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
25
- except Exception as e:
26
- print(f"Error embedding text: {e}")
27
- return [0.0] * EMBEDDING_DIM
28
-
29
- def embed_batch(self, texts: List[str]) -> List[List[float]]:
30
- """Эмбеддинг для текста"""
31
- try:
32
- embeddings = self.model.encode(texts, convert_to_numpy=False)
33
- return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
34
- except Exception as e:
35
- print(f"Error embedding batch: {e}")
36
- return [[0.0] * EMBEDDING_DIM] * len(texts)
37
-
38
 
39
  class VectorStore:
40
- """Векторное хранилище"""
41
  def __init__(self):
42
- self.persist_directory = CHROMA_DB_PATH
43
- self.embedder = CLIPEmbedder()
44
-
45
- print(f"\nInitializing ChromaDB: {self.persist_directory}")
46
-
 
 
 
 
 
 
 
 
 
47
  try:
48
- self.client = chromadb.PersistentClient(
49
- path=self.persist_directory
 
 
 
 
 
 
 
 
 
50
  )
51
- print(f"ChromaDB initialized")
52
  except Exception as e:
53
- print(f"Error initializing ChromaDB: {e}")
54
- self.client = chromadb.PersistentClient(
55
- path=self.persist_directory
56
- )
57
-
58
- try:
59
- self.collection = self.client.get_or_create_colletion(
60
- name="multimodal_rag",
61
- metadata={"hnsw:space": "cosine"}
62
- )
63
- count = self.collection.count()
64
- print(f"Collection loaded: {count} items in store")
65
- except Exception as e:
66
- print(f"Error with collection: {e}")
67
- self.collection = self.client.get_or_create_collection(
68
- name="multimodal_rag"
69
- )
70
-
71
- def add_documents(self, documents: List[Dict], doc_id: str):
72
- """Добавление документов в векторное хранилище"""
73
- texts = []
74
- metadatas = []
75
- ids = []
76
-
77
- print(f"\nAdding document: {doc_id}")
78
-
79
- if 'text' in documents and documents['text']:
80
- chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
81
- for idx, chunk in enumerate(chunks):
82
- texts.append(chunk)
83
- metadatas.append({
84
- 'doc_id': doc_id,
85
- 'type': 'text',
86
- 'chunk_idx': str(idx)
87
- })
88
- ids.append(f"{doc_id}_text_{idx}")
89
- print(f" Text: {len(chunks)} chunks")
90
-
91
- if 'images' in documents:
92
- image_count = 0
93
- for idx, image_data in enumerate(documents['images']):
94
- if image_data.get('ocr_text'):
95
- texts.append(f"Image {idx}: {image_data['ocr_text']}")
96
- metadatas.append({
97
- 'doc_id': doc_id,
98
- 'type': 'image',
99
- 'image_idx': str(idx),
100
- 'image_path': image_data.get('path', '')
101
- })
102
- ids.append(f"{doc_id}_image_{idx}")
103
- image_count += 1
104
- if image_count > 0:
105
- print(f" Images: {image_count} with OCR text")
106
-
107
- if 'tables' in documents:
108
- table_count = 0
109
- for idx, table_data in enumerate(documents['tables']):
110
- if table_data.get('content'):
111
- texts.append(f"Table {idx}: {table_data.get('content', '')}")
112
- metadatas.append({
113
- 'doc_id': doc_id,
114
- 'type': 'table',
115
- 'table_idx': str(idx)
116
- })
117
- ids.append(f"{doc_id}_table_{idx}")
118
- table_count += 1
119
- if table_count > 0:
120
- print(f" Tables: {table_count}")
121
-
122
- if texts:
123
- print(f" 🔄 Generating {len(texts)} embeddings...")
124
- embeddings = self.embedder.embed_batch(texts)
125
-
126
- try:
127
- self.collection.add(
128
- ids=ids,
129
- documents=texts,
130
- embeddings=embeddings,
131
- metadatas=metadatas
132
- )
133
- print(f"Successfully added {len(texts)} items to vector store")
134
- except Exception as e:
135
- print(f"Error adding to collection: {e}")
136
 
137
  def search(self, query: str, n_results: int = 5) -> List[Dict]:
138
- """Поиск в векторном хранилище"""
139
  try:
140
- query_embedding = self.embedder.embed(query)
141
-
142
  results = self.collection.query(
143
- query_embeddings=[query_embedding],
144
- n_results=n_results
 
145
  )
146
-
147
  formatted_results = []
148
- if results['documents']:
149
- for i, doc in enumerate(results['documents'][0]):
150
- metadata = results['metadatas'][0][i] if results['metadatas'] else {}
151
- distance = results['distances'][0][i] if results['distances'] else 0
152
-
153
  formatted_results.append({
154
  'content': doc,
155
- 'metadata': metadata,
156
  'distance': distance,
157
- 'type': metadata.get('type', 'unknown')
158
  })
159
-
160
  return formatted_results
161
  except Exception as e:
162
  print(f"Error searching vector store: {e}")
163
  return []
164
 
165
- def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
166
- """Сплит текста"""
167
- chunks = []
168
- start = 0
169
- while start < len(text):
170
- end = start + chunk_size
171
- chunks.append(text[start:end])
172
- start = end - overlap
173
- return chunks
174
-
175
  def get_collection_info(self) -> Dict:
176
- """Получение информации о коллекции в вектороном хранилище"""
177
  try:
178
  count = self.collection.count()
179
  return {
180
- 'name': 'multimodal_rag',
181
  'count': count,
182
- 'status': 'active',
183
- 'persist_path': self.persist_directory
184
  }
185
  except Exception as e:
186
  print(f"Error getting collection info: {e}")
187
- return {'status': 'error', 'message': str(e)}
188
-
189
- def delete_by_doc_id(self, doc_id: str):
190
- """Удаление документа из векторного хранилища"""
191
- try:
192
- results = self.collection.get(where={'doc_id': doc_id})
193
- if results['ids']:
194
- self.collection.delete(ids=results['ids'])
195
- print(f"Deleted {len(results['ids'])} documents for {doc_id}")
196
- except Exception as e:
197
- print(f"Error deleting documents: {e}")
198
 
199
  def clear_all(self):
200
- """Очистка хранилища"""
201
  try:
202
- self.client.delete_collection(name="multimodal_rag")
203
  self.collection = self.client.get_or_create_collection(
204
- name="multimodal_rag",
205
  metadata={"hnsw:space": "cosine"}
206
  )
207
- print("Collection cleared")
208
  except Exception as e:
209
- print(f"Error clearing collection: {e}")
 
 
 
 
1
  import os
 
2
  from typing import List, Dict
3
+ from chromadb.config import Settings
4
  import chromadb
5
+ from config import CHROMA_DB_PATH
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
class VectorStore:
    """Persistent ChromaDB-backed store for parsed PDF documents.

    One collection ("documents", cosine space) holds one entry per PDF,
    keyed by ``doc_id``.
    """

    _COLLECTION_NAME = "documents"

    def __init__(self):
        self.chroma_path = CHROMA_DB_PATH
        # `chroma_db_impl_embed_collection_mixin` / `persist_directory` are
        # not valid Settings fields on current chromadb; passing them to
        # chromadb.Client() either raises or silently yields an *in-memory*
        # client that drops data on exit. PersistentClient(path=...) is the
        # supported way to persist to disk on chromadb >= 0.4.
        self.settings = Settings(
            anonymized_telemetry=False,
            allow_reset=True,
        )
        self.client = chromadb.PersistentClient(
            path=self.chroma_path,
            settings=self.settings,
        )
        self.collection = self.client.get_or_create_collection(
            name=self._COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"},
        )

    def add_documents(self, documents: Dict, doc_id: str):
        """Store the parsed document's text under ``doc_id``.

        Args:
            documents: parser output; only the ``'text'`` field is indexed.
            doc_id: unique identifier for the source PDF.

        Raises:
            Re-raises any ChromaDB error after logging it.
        """
        try:
            text = documents.get('text', '')
            if not text or not text.strip():
                print(f"Empty text for {doc_id}")
                return
            # upsert instead of add: reprocessing the same PDF must update
            # the entry rather than fail with a duplicate-ID error.
            self.collection.upsert(
                ids=[doc_id],
                documents=[text],
                metadatas=[{
                    'doc_id': doc_id,
                    'source': 'pdf_document'
                }]
            )
            print(f"Added document to vector store: {doc_id}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

    def search(self, query: str, n_results: int = 5) -> List[Dict]:
        """Return up to ``n_results`` hits for ``query``.

        Each hit is a dict with ``content``, ``metadata``, ``distance``
        (cosine, smaller is closer) and ``type``. Returns [] on error.
        """
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
                # Embeddings were requested but never consumed by callers;
                # excluding them avoids shipping large vectors per search.
                include=['documents', 'metadatas', 'distances']
            )
            formatted_results = []
            if results and results['documents'] and len(results['documents']) > 0:
                for idx, doc in enumerate(results['documents'][0]):
                    distance = results['distances'][0][idx] if results['distances'] else 0
                    formatted_results.append({
                        'content': doc,
                        'metadata': results['metadatas'][0][idx] if results['metadatas'] else {},
                        'distance': distance,
                        'type': 'document'
                    })
            return formatted_results
        except Exception as e:
            print(f"Error searching vector store: {e}")
            return []

    def get_collection_info(self) -> Dict:
        """Report item count, status and on-disk path of the collection."""
        try:
            count = self.collection.count()
            return {
                'count': count,
                'status': 'ready',
                'persist_path': self.chroma_path
            }
        except Exception as e:
            print(f"Error getting collection info: {e}")
            return {
                'count': 0,
                'status': 'error',
                'persist_path': self.chroma_path
            }

    def clear_all(self):
        """Drop and recreate the collection, discarding all stored documents."""
        try:
            self.client.delete_collection(name=self._COLLECTION_NAME)
            self.collection = self.client.get_or_create_collection(
                name=self._COLLECTION_NAME,
                metadata={"hnsw:space": "cosine"}
            )
            print("Vector store cleared")
        except Exception as e:
            print(f"Error clearing vector store: {e}")