Spaces:
Sleeping
Sleeping
Simplify
Browse files- src/app.py +82 -107
- src/config.py +11 -7
- src/pdf_parser.py +24 -70
- src/rag_system.py +106 -239
- src/vector_store.py +49 -170
src/app.py
CHANGED
|
@@ -1,33 +1,25 @@
|
|
| 1 |
-
"""
|
| 2 |
-
UI RAG
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
import streamlit as st
|
| 6 |
import os
|
| 7 |
from pathlib import Path
|
| 8 |
-
|
| 9 |
from pdf_parser import PDFParser
|
| 10 |
from vector_store import VectorStore
|
| 11 |
-
from rag_system import VisualMultimodalRAG
|
| 12 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
st.set_page_config(
|
| 17 |
-
page_title="Мультимодальная
|
|
|
|
| 18 |
layout="wide",
|
| 19 |
initial_sidebar_state="expanded"
|
| 20 |
)
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
if 'api_key_set' not in st.session_state:
|
| 25 |
st.session_state.api_key_set = False
|
| 26 |
|
| 27 |
if 'api_key' not in st.session_state:
|
| 28 |
st.session_state.api_key = None
|
| 29 |
|
| 30 |
-
if 'visual_rag_system' not in st.session_state:
|
| 31 |
st.session_state.visual_rag_system = None
|
| 32 |
|
| 33 |
if 'vector_store' not in st.session_state:
|
|
@@ -54,22 +46,20 @@ if 'processing_results' not in st.session_state:
|
|
| 54 |
if 'answering_rag' not in st.session_state:
|
| 55 |
st.session_state.answering_rag = None
|
| 56 |
|
|
|
|
| 57 |
|
| 58 |
-
|
| 59 |
-
st.title("Мультимодальная RAG система (PDF parsing)")
|
| 60 |
st.markdown("""
|
| 61 |
-
Обрабатывает PDF документы и предоставляет информацию по ним
|
| 62 |
-
""")
|
| 63 |
-
|
| 64 |
|
|
|
|
|
|
|
| 65 |
|
| 66 |
with st.sidebar:
|
| 67 |
-
st.header("
|
| 68 |
|
| 69 |
-
st.subheader("
|
| 70 |
|
| 71 |
api_key = st.text_input(
|
| 72 |
-
"Введите
|
| 73 |
type="password",
|
| 74 |
key="api_key_input"
|
| 75 |
)
|
|
@@ -80,70 +70,65 @@ with st.sidebar:
|
|
| 80 |
|
| 81 |
if st.session_state.visual_rag_system is None:
|
| 82 |
try:
|
| 83 |
-
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True)
|
| 84 |
st.session_state.vector_store = VectorStore()
|
| 85 |
st.session_state.parser = PDFParser(debug=True)
|
| 86 |
-
st.success("API
|
| 87 |
except Exception as e:
|
| 88 |
-
st.error(f"Ошибка
|
| 89 |
else:
|
| 90 |
st.session_state.api_key_set = False
|
| 91 |
-
st.warning("Введите
|
| 92 |
|
| 93 |
st.divider()
|
| 94 |
|
| 95 |
st.subheader("Векторное хранилище")
|
|
|
|
| 96 |
if st.session_state.vector_store:
|
| 97 |
try:
|
| 98 |
info = st.session_state.vector_store.get_collection_info()
|
| 99 |
-
st.metric("
|
| 100 |
-
st.caption(f"
|
| 101 |
except Exception as e:
|
| 102 |
-
st.error(f"Ошибка получения
|
| 103 |
else:
|
| 104 |
-
st.info("
|
| 105 |
|
| 106 |
st.divider()
|
| 107 |
|
| 108 |
-
st.subheader("Управление
|
| 109 |
-
|
|
|
|
| 110 |
if st.session_state.vector_store:
|
| 111 |
try:
|
| 112 |
st.session_state.vector_store.clear_all()
|
| 113 |
-
st.success("
|
| 114 |
except Exception as e:
|
| 115 |
-
st.error(f"Ошибка
|
| 116 |
-
|
| 117 |
|
| 118 |
-
|
| 119 |
-
st.header("Загрузить PDF")
|
| 120 |
|
| 121 |
uploaded_file = st.file_uploader(
|
| 122 |
-
"
|
| 123 |
type=['pdf'],
|
| 124 |
-
help="
|
| 125 |
)
|
| 126 |
|
| 127 |
if uploaded_file is not None:
|
| 128 |
upload_path = Path(UPLOAD_FOLDER)
|
| 129 |
upload_path.mkdir(exist_ok=True)
|
| 130 |
-
|
| 131 |
file_path = upload_path / uploaded_file.name
|
| 132 |
with open(file_path, 'wb') as f:
|
| 133 |
f.write(uploaded_file.getbuffer())
|
| 134 |
-
|
| 135 |
-
st.success(f"Файл загружен: {uploaded_file.name}")
|
| 136 |
|
| 137 |
if st.button("Распарсить PDF"):
|
| 138 |
if not st.session_state.api_key_set:
|
| 139 |
-
st.error("Введите
|
| 140 |
else:
|
| 141 |
try:
|
| 142 |
-
with st.spinner("
|
| 143 |
-
|
| 144 |
-
print(f"Парсинг PDF файла: {uploaded_file.name}")
|
| 145 |
-
|
| 146 |
-
|
| 147 |
parser = st.session_state.parser
|
| 148 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 149 |
|
|
@@ -154,42 +139,45 @@ if uploaded_file is not None:
|
|
| 154 |
|
| 155 |
col1, col2, col3 = st.columns(3)
|
| 156 |
with col1:
|
| 157 |
-
st.metric("
|
| 158 |
with col2:
|
| 159 |
-
st.metric("
|
| 160 |
with col3:
|
| 161 |
-
st.metric("
|
| 162 |
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
|
|
|
| 165 |
except Exception as e:
|
| 166 |
-
st.error(f"
|
| 167 |
-
print(f"Ошибка: {e}")
|
| 168 |
-
|
| 169 |
-
|
| 170 |
|
| 171 |
st.divider()
|
| 172 |
-
st.header("Анализ документа")
|
| 173 |
|
| 174 |
-
st.
|
| 175 |
-
Отправляет содержимое документа на анализ
|
| 176 |
-
""")
|
| 177 |
|
| 178 |
-
|
|
|
|
| 179 |
if not st.session_state.api_key_set:
|
| 180 |
-
st.error("Введите
|
| 181 |
elif st.session_state.current_text is None:
|
| 182 |
-
st.error("
|
| 183 |
else:
|
| 184 |
try:
|
| 185 |
-
with st.spinner("Анализ с gpt-4o-mini..."):
|
| 186 |
-
|
| 187 |
visual_rag = st.session_state.visual_rag_system
|
| 188 |
vector_store = st.session_state.vector_store
|
| 189 |
-
|
| 190 |
results = visual_rag.process_and_store_document(
|
| 191 |
text=st.session_state.current_text,
|
| 192 |
-
images=st.session_state.current_images,
|
| 193 |
tables=st.session_state.current_tables,
|
| 194 |
vector_store=vector_store,
|
| 195 |
doc_id=st.session_state.current_document or "current_doc"
|
|
@@ -197,53 +185,45 @@ if st.button("Проанализировать документ"):
|
|
| 197 |
|
| 198 |
st.session_state.processing_results = results
|
| 199 |
|
| 200 |
-
st.success("Анализ
|
| 201 |
|
| 202 |
col1, col2, col3 = st.columns(3)
|
| 203 |
with col1:
|
| 204 |
st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
|
| 205 |
with col2:
|
| 206 |
-
st.metric("
|
| 207 |
with col3:
|
| 208 |
st.metric("Проанализировано та��лиц", len(results['table_summaries']))
|
| 209 |
|
| 210 |
-
st.metric("
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
print(f"Анализ завершен")
|
| 214 |
|
| 215 |
except Exception as e:
|
| 216 |
-
st.error(f"Ошибка
|
| 217 |
-
print(f"Ошибка: {e}")
|
| 218 |
-
|
| 219 |
-
|
| 220 |
|
| 221 |
st.divider()
|
| 222 |
-
st.header("Работа с документом")
|
| 223 |
|
| 224 |
-
|
| 225 |
-
st.session_state.answering_rag = None
|
| 226 |
|
| 227 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
| 228 |
-
from rag_system import AnsweringRAG
|
| 229 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 230 |
|
| 231 |
question = st.text_area(
|
| 232 |
-
"Введите
|
| 233 |
height=100,
|
| 234 |
-
placeholder="О чем
|
| 235 |
)
|
| 236 |
|
| 237 |
-
if st.button("
|
| 238 |
if not st.session_state.api_key_set:
|
| 239 |
-
st.error("Введите
|
| 240 |
elif st.session_state.current_text is None:
|
| 241 |
-
st.error("
|
| 242 |
elif not question:
|
| 243 |
-
st.error("Введите
|
| 244 |
else:
|
| 245 |
try:
|
| 246 |
-
with st.spinner("Поиск
|
|
|
|
| 247 |
store = st.session_state.vector_store
|
| 248 |
|
| 249 |
doc_name = st.session_state.current_document or "current_doc"
|
|
@@ -252,55 +232,50 @@ if st.button("Генерация ответа"):
|
|
| 252 |
'images': [],
|
| 253 |
'tables': []
|
| 254 |
}
|
|
|
|
| 255 |
store.add_documents(doc_data, doc_name)
|
| 256 |
|
| 257 |
search_results = store.search(question, n_results=5)
|
| 258 |
|
| 259 |
-
print(f"Найдено: {len(search_results)}")
|
| 260 |
-
|
| 261 |
answering_rag = st.session_state.answering_rag
|
| 262 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 263 |
|
| 264 |
-
st.success("
|
| 265 |
|
| 266 |
st.subheader("Ответ")
|
| 267 |
|
| 268 |
col1, col2, col3 = st.columns(3)
|
| 269 |
with col1:
|
| 270 |
-
|
| 271 |
-
'high': '
|
| 272 |
-
'medium': '
|
| 273 |
-
'low': '
|
| 274 |
-
}
|
| 275 |
-
|
|
|
|
| 276 |
with col2:
|
| 277 |
st.metric("Использовано источников", result['sources_used'])
|
| 278 |
with col3:
|
| 279 |
if result['sources_used'] > 0:
|
| 280 |
-
st.metric("
|
| 281 |
|
| 282 |
st.write(result['answer'])
|
| 283 |
|
| 284 |
if st.checkbox("Показать исходные документы"):
|
| 285 |
-
st.subheader("
|
| 286 |
-
for
|
| 287 |
relevance = source['relevance']
|
| 288 |
-
relevance_bar = "
|
| 289 |
-
|
| 290 |
with st.expander(
|
| 291 |
-
f"Источник {
|
| 292 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 293 |
):
|
| 294 |
st.write(source['content'])
|
| 295 |
-
|
| 296 |
-
print(f" Ответ готов!")
|
| 297 |
-
|
| 298 |
except Exception as e:
|
| 299 |
-
st.error(f"Ошибка
|
| 300 |
-
print(f"Ошибка: {e}")
|
| 301 |
|
| 302 |
st.divider()
|
| 303 |
|
| 304 |
st.caption(
|
| 305 |
-
"Мультимодальная
|
| 306 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
|
|
|
| 4 |
from pdf_parser import PDFParser
|
| 5 |
from vector_store import VectorStore
|
| 6 |
+
from rag_system import VisualMultimodalRAG, AnsweringRAG
|
| 7 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 8 |
|
|
|
|
|
|
|
| 9 |
st.set_page_config(
|
| 10 |
+
page_title="Мультимодальная система RAG LLM",
|
| 11 |
+
page_icon="",
|
| 12 |
layout="wide",
|
| 13 |
initial_sidebar_state="expanded"
|
| 14 |
)
|
| 15 |
|
|
|
|
|
|
|
| 16 |
if 'api_key_set' not in st.session_state:
|
| 17 |
st.session_state.api_key_set = False
|
| 18 |
|
| 19 |
if 'api_key' not in st.session_state:
|
| 20 |
st.session_state.api_key = None
|
| 21 |
|
| 22 |
+
if 'visual_rag_system' not in st.session_state:
|
| 23 |
st.session_state.visual_rag_system = None
|
| 24 |
|
| 25 |
if 'vector_store' not in st.session_state:
|
|
|
|
| 46 |
if 'answering_rag' not in st.session_state:
|
| 47 |
st.session_state.answering_rag = None
|
| 48 |
|
| 49 |
+
st.title("Мультимодальная система RAG LLM")
|
| 50 |
|
|
|
|
|
|
|
| 51 |
st.markdown("""
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
+
Обработка PDF-документов с анализом визуального контента
|
| 54 |
+
""")
|
| 55 |
|
| 56 |
with st.sidebar:
|
| 57 |
+
st.header("Конфигурация")
|
| 58 |
|
| 59 |
+
st.subheader("Ключ API OpenAI")
|
| 60 |
|
| 61 |
api_key = st.text_input(
|
| 62 |
+
"Введите ваш ключ API OpenAI:",
|
| 63 |
type="password",
|
| 64 |
key="api_key_input"
|
| 65 |
)
|
|
|
|
| 70 |
|
| 71 |
if st.session_state.visual_rag_system is None:
|
| 72 |
try:
|
| 73 |
+
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True)
|
| 74 |
st.session_state.vector_store = VectorStore()
|
| 75 |
st.session_state.parser = PDFParser(debug=True)
|
| 76 |
+
st.success("Ключ API установлен")
|
| 77 |
except Exception as e:
|
| 78 |
+
st.error(f"Ошибка при инициализации систем: {e}")
|
| 79 |
else:
|
| 80 |
st.session_state.api_key_set = False
|
| 81 |
+
st.warning("Введите ключ API для продолжения")
|
| 82 |
|
| 83 |
st.divider()
|
| 84 |
|
| 85 |
st.subheader("Векторное хранилище")
|
| 86 |
+
|
| 87 |
if st.session_state.vector_store:
|
| 88 |
try:
|
| 89 |
info = st.session_state.vector_store.get_collection_info()
|
| 90 |
+
st.metric("Элементов в хранилище", info['count'])
|
| 91 |
+
st.caption(f"Путь: {info['persist_path']}")
|
| 92 |
except Exception as e:
|
| 93 |
+
st.error(f"Ошибка получения информации о хранилище: {e}")
|
| 94 |
else:
|
| 95 |
+
st.info("Установите ключ API для инициализации векторного хранилища")
|
| 96 |
|
| 97 |
st.divider()
|
| 98 |
|
| 99 |
+
st.subheader("Управление документами")
|
| 100 |
+
|
| 101 |
+
if st.button("Очистить векторное хранилище"):
|
| 102 |
if st.session_state.vector_store:
|
| 103 |
try:
|
| 104 |
st.session_state.vector_store.clear_all()
|
| 105 |
+
st.success("Векторное хранилище очищено")
|
| 106 |
except Exception as e:
|
| 107 |
+
st.error(f"Ошибка при очистке хранилища: {e}")
|
|
|
|
| 108 |
|
| 109 |
+
st.header("Загрузка PDF-документа")
|
|
|
|
| 110 |
|
| 111 |
uploaded_file = st.file_uploader(
|
| 112 |
+
"Выберите PDF-файл",
|
| 113 |
type=['pdf'],
|
| 114 |
+
help="PDF с текстом, изображениями и таблицами"
|
| 115 |
)
|
| 116 |
|
| 117 |
if uploaded_file is not None:
|
| 118 |
upload_path = Path(UPLOAD_FOLDER)
|
| 119 |
upload_path.mkdir(exist_ok=True)
|
|
|
|
| 120 |
file_path = upload_path / uploaded_file.name
|
| 121 |
with open(file_path, 'wb') as f:
|
| 122 |
f.write(uploaded_file.getbuffer())
|
| 123 |
+
st.success(f"Файл сохранён: {uploaded_file.name}")
|
|
|
|
| 124 |
|
| 125 |
if st.button("Распарсить PDF"):
|
| 126 |
if not st.session_state.api_key_set:
|
| 127 |
+
st.error("Введите ключ API для продолжения")
|
| 128 |
else:
|
| 129 |
try:
|
| 130 |
+
with st.spinner("Парсинг PDF..."):
|
| 131 |
+
print("PARSING: " + uploaded_file.name)
|
|
|
|
|
|
|
|
|
|
| 132 |
parser = st.session_state.parser
|
| 133 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 134 |
|
|
|
|
| 139 |
|
| 140 |
col1, col2, col3 = st.columns(3)
|
| 141 |
with col1:
|
| 142 |
+
st.metric("Текст", f"{len(text):,} символов")
|
| 143 |
with col2:
|
| 144 |
+
st.metric("Изображения", len(images))
|
| 145 |
with col3:
|
| 146 |
+
st.metric("Таблицы", len(tables))
|
| 147 |
|
| 148 |
+
# if images:
|
| 149 |
+
# st.subheader("Извлечённые изображения")
|
| 150 |
+
# for idx, img in enumerate(images):
|
| 151 |
+
# ocr_text = img.get('ocr_text', '')
|
| 152 |
+
# ocr_len = len(ocr_text)
|
| 153 |
+
# if ocr_len > 0:
|
| 154 |
+
# st.success(f"Изображение {idx}: {ocr_len} символов (OCR)")
|
| 155 |
+
# else:
|
| 156 |
+
# st.warning(f"Изображение {idx}: Текст OCR не найден (будет использоваться визуальный анализ)")
|
| 157 |
|
| 158 |
+
st.success("Парсинг PDF завершён!")
|
| 159 |
except Exception as e:
|
| 160 |
+
st.error(f"Ошибка при парсинге PDF: {e}")
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
st.divider()
|
|
|
|
| 163 |
|
| 164 |
+
st.header("Анализ")
|
|
|
|
|
|
|
| 165 |
|
| 166 |
+
|
| 167 |
+
if st.button("Анализировать"):
|
| 168 |
if not st.session_state.api_key_set:
|
| 169 |
+
st.error("Введите ключ API для продолжения")
|
| 170 |
elif st.session_state.current_text is None:
|
| 171 |
+
st.error("Распарсьте PDF-документ")
|
| 172 |
else:
|
| 173 |
try:
|
| 174 |
+
with st.spinner("Анализ изображений с помощью gpt-4o-mini..."):
|
| 175 |
+
print("ANALYSIS")
|
| 176 |
visual_rag = st.session_state.visual_rag_system
|
| 177 |
vector_store = st.session_state.vector_store
|
|
|
|
| 178 |
results = visual_rag.process_and_store_document(
|
| 179 |
text=st.session_state.current_text,
|
| 180 |
+
images=st.session_state.current_images,
|
| 181 |
tables=st.session_state.current_tables,
|
| 182 |
vector_store=vector_store,
|
| 183 |
doc_id=st.session_state.current_document or "current_doc"
|
|
|
|
| 185 |
|
| 186 |
st.session_state.processing_results = results
|
| 187 |
|
| 188 |
+
st.success("Анализ завершён и сохранён!")
|
| 189 |
|
| 190 |
col1, col2, col3 = st.columns(3)
|
| 191 |
with col1:
|
| 192 |
st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
|
| 193 |
with col2:
|
| 194 |
+
st.metric("Фрагментов текста", len(results['text_summaries']))
|
| 195 |
with col3:
|
| 196 |
st.metric("Проанализировано та��лиц", len(results['table_summaries']))
|
| 197 |
|
| 198 |
+
st.metric("Всего сохранено в вектор", results['total_stored'])
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
except Exception as e:
|
| 201 |
+
st.error(f"Ошибка при анализе: {e}")
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
st.divider()
|
|
|
|
| 204 |
|
| 205 |
+
st.header("Задать вопрос о документе")
|
|
|
|
| 206 |
|
| 207 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
|
|
|
| 208 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 209 |
|
| 210 |
question = st.text_area(
|
| 211 |
+
"Введите ваш вопрос:",
|
| 212 |
height=100,
|
| 213 |
+
placeholder="О чем говорится в документе?"
|
| 214 |
)
|
| 215 |
|
| 216 |
+
if st.button("Поиск и генерация ответа"):
|
| 217 |
if not st.session_state.api_key_set:
|
| 218 |
+
st.error("Введите ключ API для продолжения")
|
| 219 |
elif st.session_state.current_text is None:
|
| 220 |
+
st.error("Распарсьте PDF-документ")
|
| 221 |
elif not question:
|
| 222 |
+
st.error("Введите вопрос")
|
| 223 |
else:
|
| 224 |
try:
|
| 225 |
+
with st.spinner("Поиск в документе и анализ..."):
|
| 226 |
+
print("QUESTION: " + question)
|
| 227 |
store = st.session_state.vector_store
|
| 228 |
|
| 229 |
doc_name = st.session_state.current_document or "current_doc"
|
|
|
|
| 232 |
'images': [],
|
| 233 |
'tables': []
|
| 234 |
}
|
| 235 |
+
|
| 236 |
store.add_documents(doc_data, doc_name)
|
| 237 |
|
| 238 |
search_results = store.search(question, n_results=5)
|
| 239 |
|
|
|
|
|
|
|
| 240 |
answering_rag = st.session_state.answering_rag
|
| 241 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 242 |
|
| 243 |
+
st.success("Анализ завершён!")
|
| 244 |
|
| 245 |
st.subheader("Ответ")
|
| 246 |
|
| 247 |
col1, col2, col3 = st.columns(3)
|
| 248 |
with col1:
|
| 249 |
+
confidence_map = {
|
| 250 |
+
'high': 'ВЫСОКАЯ',
|
| 251 |
+
'medium': 'СРЕДНЯЯ',
|
| 252 |
+
'low': 'НИЗКАЯ'
|
| 253 |
+
}
|
| 254 |
+
confidence_text = confidence_map.get(result['confidence'], result['confidence'].upper())
|
| 255 |
+
st.metric("Уверенность", confidence_text)
|
| 256 |
with col2:
|
| 257 |
st.metric("Использовано источников", result['sources_used'])
|
| 258 |
with col3:
|
| 259 |
if result['sources_used'] > 0:
|
| 260 |
+
st.metric("Сред. релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
|
| 261 |
|
| 262 |
st.write(result['answer'])
|
| 263 |
|
| 264 |
if st.checkbox("Показать исходные документы"):
|
| 265 |
+
st.subheader("Источники, использованные в ответе")
|
| 266 |
+
for source in result.get('formatted_sources', []):
|
| 267 |
relevance = source['relevance']
|
| 268 |
+
relevance_bar = "█" * int(relevance * 10) + "░" * (10 - int(relevance * 10))
|
|
|
|
| 269 |
with st.expander(
|
| 270 |
+
f"Источник {source['index']} - {source['type'].upper()} "
|
| 271 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 272 |
):
|
| 273 |
st.write(source['content'])
|
|
|
|
|
|
|
|
|
|
| 274 |
except Exception as e:
|
| 275 |
+
st.error(f"Ошибка при обработке вопроса: {e}")
|
|
|
|
| 276 |
|
| 277 |
st.divider()
|
| 278 |
|
| 279 |
st.caption(
|
| 280 |
+
"Мультимодальная система RAG"
|
| 281 |
)
|
src/config.py
CHANGED
|
@@ -1,34 +1,38 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Конфигурационный файл
|
| 3 |
-
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
|
|
|
| 8 |
OPENAI_MODEL = "gpt-4o-mini"
|
|
|
|
| 9 |
USE_CACHE = True
|
| 10 |
|
| 11 |
CHROMA_DB_PATH = "./chroma_db"
|
|
|
|
| 12 |
DOCSTORE_PATH = "./docstore"
|
|
|
|
| 13 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 14 |
|
| 15 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
|
|
|
| 16 |
EMBEDDING_DIM = 768
|
| 17 |
|
| 18 |
MAX_CHUNK_SIZE = 500
|
|
|
|
| 19 |
CHUNK_OVERLAP = 50
|
|
|
|
| 20 |
TEMPERATURE = 0.3
|
|
|
|
| 21 |
MAX_TOKENS = 500
|
| 22 |
|
| 23 |
LANGUAGE = "russian"
|
| 24 |
|
| 25 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
|
|
|
| 26 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 27 |
|
| 28 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
|
|
|
| 29 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 30 |
-
MAX_PDF_SIZE_MB = 50
|
| 31 |
|
| 32 |
-
|
| 33 |
-
CACHE_RESPONSES = True
|
| 34 |
-
SUMMARIZE_FIRST = True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 5 |
+
|
| 6 |
OPENAI_MODEL = "gpt-4o-mini"
|
| 7 |
+
|
| 8 |
USE_CACHE = True
|
| 9 |
|
| 10 |
CHROMA_DB_PATH = "./chroma_db"
|
| 11 |
+
|
| 12 |
DOCSTORE_PATH = "./docstore"
|
| 13 |
+
|
| 14 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 15 |
|
| 16 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 17 |
+
|
| 18 |
EMBEDDING_DIM = 768
|
| 19 |
|
| 20 |
MAX_CHUNK_SIZE = 500
|
| 21 |
+
|
| 22 |
CHUNK_OVERLAP = 50
|
| 23 |
+
|
| 24 |
TEMPERATURE = 0.3
|
| 25 |
+
|
| 26 |
MAX_TOKENS = 500
|
| 27 |
|
| 28 |
LANGUAGE = "russian"
|
| 29 |
|
| 30 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 31 |
+
|
| 32 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 33 |
|
| 34 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 35 |
+
|
| 36 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
|
|
|
| 37 |
|
| 38 |
+
MAX_PDF_SIZE_MB = 50
|
|
|
|
|
|
src/pdf_parser.py
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
-
"""
|
| 2 |
-
PDF Парсер
|
| 3 |
-
"""
|
| 4 |
import os
|
| 5 |
import json
|
| 6 |
import hashlib
|
|
@@ -12,34 +9,37 @@ from PIL import Image
|
|
| 12 |
import pytesseract
|
| 13 |
from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
|
| 14 |
|
| 15 |
-
|
| 16 |
class PDFParser:
|
| 17 |
def __init__(self, debug: bool = True):
|
| 18 |
self.docstore_path = Path(DOCSTORE_PATH)
|
| 19 |
self.docstore_path.mkdir(exist_ok=True)
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
-
|
| 23 |
-
|
| 24 |
if self.debug:
|
| 25 |
print("PDFParser initialized")
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def _debug_print(self, label: str, data: any):
|
| 28 |
-
"""Debug"""
|
| 29 |
if self.debug:
|
| 30 |
-
print(f"
|
| 31 |
if isinstance(data, dict):
|
| 32 |
for key, val in data.items():
|
| 33 |
-
print(f"
|
| 34 |
elif isinstance(data, (list, tuple)):
|
| 35 |
-
print(f"
|
| 36 |
for i, item in enumerate(data[:3]):
|
| 37 |
-
print(f"
|
| 38 |
else:
|
| 39 |
-
print(f"
|
| 40 |
|
| 41 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 42 |
-
"""Подгрузка обработанных файлов"""
|
| 43 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 44 |
try:
|
| 45 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
@@ -49,12 +49,10 @@ class PDFParser:
|
|
| 49 |
return {}
|
| 50 |
|
| 51 |
def _save_processed_files(self):
|
| 52 |
-
"""Сохранение обработанных файлов"""
|
| 53 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 54 |
json.dump(self.processed_files, f, indent=2)
|
| 55 |
|
| 56 |
def _get_file_hash(self, file_path: str) -> str:
|
| 57 |
-
"""Проверка изменения файлов"""
|
| 58 |
hash_md5 = hashlib.md5()
|
| 59 |
with open(file_path, "rb") as f:
|
| 60 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
@@ -62,56 +60,43 @@ class PDFParser:
|
|
| 62 |
return hash_md5.hexdigest()
|
| 63 |
|
| 64 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
| 65 |
-
"""Извлечение текста из PDF"""
|
| 66 |
text = ""
|
| 67 |
try:
|
| 68 |
with open(pdf_path, 'rb') as file:
|
| 69 |
reader = PyPDF2.PdfReader(file)
|
| 70 |
page_count = len(reader.pages)
|
| 71 |
self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
|
| 72 |
-
|
| 73 |
for page_num, page in enumerate(reader.pages):
|
| 74 |
page_text = page.extract_text()
|
| 75 |
text += page_text + "\n"
|
| 76 |
self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
|
| 77 |
except Exception as e:
|
| 78 |
self._debug_print("ERROR extracting text", str(e))
|
| 79 |
-
|
| 80 |
self._debug_print("Total Text Extracted", len(text))
|
| 81 |
return text
|
| 82 |
|
| 83 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 84 |
-
"""Извлечение изображений из PDF"""
|
| 85 |
images_data = []
|
| 86 |
try:
|
| 87 |
-
self._debug_print("Image
|
| 88 |
-
|
| 89 |
images = convert_from_path(pdf_path, dpi=150)
|
| 90 |
-
self._debug_print(f"Total images: {len(images)}")
|
| 91 |
-
|
| 92 |
for idx, image in enumerate(images):
|
| 93 |
-
self._debug_print(f"Image {idx}", f"Size: {image.size}")
|
| 94 |
-
|
| 95 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 96 |
image.save(image_path)
|
| 97 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 98 |
-
|
| 99 |
-
self._debug_print(f"Image {idx} OCR", "Running OCR...")
|
| 100 |
-
|
| 101 |
try:
|
| 102 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 103 |
-
|
| 104 |
ocr_text = ocr_text.strip()
|
| 105 |
-
|
| 106 |
if not ocr_text or len(ocr_text) < 5:
|
| 107 |
-
self._debug_print(f"Image {idx} OCR Result", f"
|
| 108 |
else:
|
| 109 |
-
self._debug_print(f"Image {idx} OCR Result", f"
|
| 110 |
-
|
| 111 |
except Exception as ocr_error:
|
| 112 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 113 |
-
ocr_text = f"
|
| 114 |
-
|
| 115 |
images_data.append({
|
| 116 |
'page': idx,
|
| 117 |
'path': str(image_path),
|
|
@@ -120,19 +105,15 @@ class PDFParser:
|
|
| 120 |
})
|
| 121 |
except Exception as e:
|
| 122 |
self._debug_print("ERROR extracting images", str(e))
|
| 123 |
-
|
| 124 |
self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
|
| 125 |
return images_data
|
| 126 |
|
| 127 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 128 |
-
"""Извлечение таблиц из PDF"""
|
| 129 |
tables_data = []
|
| 130 |
try:
|
| 131 |
text = self._extract_text_from_pdf(pdf_path)
|
| 132 |
lines = text.split('\n')
|
| 133 |
-
|
| 134 |
-
self._debug_print("Table extraction", f"Scanning {len(lines)} lines")
|
| 135 |
-
|
| 136 |
current_table = []
|
| 137 |
for line in lines:
|
| 138 |
if '|' in line or '\t' in line:
|
|
@@ -144,53 +125,40 @@ class PDFParser:
|
|
| 144 |
'description': f"Table {len(tables_data) + 1}"
|
| 145 |
})
|
| 146 |
current_table = []
|
| 147 |
-
|
| 148 |
if current_table and len(current_table) > 1:
|
| 149 |
tables_data.append({
|
| 150 |
'content': '\n'.join(current_table),
|
| 151 |
'description': f"Table {len(tables_data) + 1}"
|
| 152 |
})
|
| 153 |
-
|
| 154 |
self._debug_print("Tables Found", len(tables_data))
|
| 155 |
except Exception as e:
|
| 156 |
self._debug_print("ERROR extracting tables", str(e))
|
| 157 |
-
|
| 158 |
return tables_data
|
| 159 |
|
| 160 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 161 |
-
"""Парсинг PDF"""
|
| 162 |
file_hash = self._get_file_hash(pdf_path)
|
| 163 |
doc_id = Path(pdf_path).stem
|
| 164 |
-
|
| 165 |
self._debug_print("PDF Parsing Started", f"File: {doc_id}")
|
| 166 |
-
|
| 167 |
if doc_id in self.processed_files:
|
| 168 |
if self.processed_files[doc_id] == file_hash:
|
| 169 |
self._debug_print("Status", f"File {doc_id} already processed")
|
| 170 |
return self._load_extracted_data(doc_id)
|
| 171 |
-
|
| 172 |
-
print(f"\nProcessing PDF: {doc_id}")
|
| 173 |
-
|
| 174 |
text = self._extract_text_from_pdf(pdf_path)
|
| 175 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 176 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 177 |
-
|
| 178 |
-
self._debug_print("Summary", {
|
| 179 |
'text_length': len(text),
|
| 180 |
'images_count': len(images),
|
| 181 |
'tables_count': len(tables),
|
| 182 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 183 |
})
|
| 184 |
-
|
| 185 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 186 |
-
|
| 187 |
self.processed_files[doc_id] = file_hash
|
| 188 |
self._save_processed_files()
|
| 189 |
-
|
| 190 |
return text, images, tables
|
| 191 |
|
| 192 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
| 193 |
-
"""Сохранение извелеченных данных в Docstore"""
|
| 194 |
data = {
|
| 195 |
'text': text,
|
| 196 |
'images': images,
|
|
@@ -199,27 +167,13 @@ class PDFParser:
|
|
| 199 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 200 |
with open(data_path, 'w', encoding='utf-8') as f:
|
| 201 |
json.dump(data, f, ensure_ascii=False, indent=2)
|
| 202 |
-
|
| 203 |
self._debug_print("Data Saved", str(data_path))
|
| 204 |
|
| 205 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 206 |
-
"""Подгрузка ранее извлеченных данных из Docstore"""
|
| 207 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 208 |
try:
|
| 209 |
with open(data_path, 'r', encoding='utf-8') as f:
|
| 210 |
data = json.load(f)
|
| 211 |
return data['text'], data['images'], data['tables']
|
| 212 |
except:
|
| 213 |
-
return "", [], []
|
| 214 |
-
|
| 215 |
-
def get_all_documents(self) -> Dict:
|
| 216 |
-
"""Получение всех документов из Docstore"""
|
| 217 |
-
all_docs = {}
|
| 218 |
-
for json_file in self.docstore_path.glob("*_data.json"):
|
| 219 |
-
doc_id = json_file.stem.replace("_data", "")
|
| 220 |
-
try:
|
| 221 |
-
with open(json_file, 'r', encoding='utf-8') as f:
|
| 222 |
-
all_docs[doc_id] = json.load(f)
|
| 223 |
-
except:
|
| 224 |
-
pass
|
| 225 |
-
return all_docs
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import hashlib
|
|
|
|
| 9 |
import pytesseract
|
| 10 |
from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
|
| 11 |
|
|
|
|
| 12 |
class PDFParser:
|
| 13 |
def __init__(self, debug: bool = True):
|
| 14 |
self.docstore_path = Path(DOCSTORE_PATH)
|
| 15 |
self.docstore_path.mkdir(exist_ok=True)
|
| 16 |
self.processed_files = self._load_processed_files()
|
| 17 |
self.debug = debug
|
| 18 |
+
self._configure_tesseract()
|
|
|
|
| 19 |
if self.debug:
|
| 20 |
print("PDFParser initialized")
|
| 21 |
|
| 22 |
+
def _configure_tesseract(self):
|
| 23 |
+
try:
|
| 24 |
+
pytesseract.get_tesseract_version()
|
| 25 |
+
print("Tesseract configured successfully")
|
| 26 |
+
except Exception as e:
|
| 27 |
+
print(f"Tesseract configuration warning: {e}")
|
| 28 |
+
|
| 29 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 30 |
if self.debug:
|
| 31 |
+
print(f"[PDF Parser] {label}")
|
| 32 |
if isinstance(data, dict):
|
| 33 |
for key, val in data.items():
|
| 34 |
+
print(f" {key}: {val}")
|
| 35 |
elif isinstance(data, (list, tuple)):
|
| 36 |
+
print(f" Count: {len(data)}")
|
| 37 |
for i, item in enumerate(data[:3]):
|
| 38 |
+
print(f" [{i}]: {str(item)[:100]}")
|
| 39 |
else:
|
| 40 |
+
print(f" {data}")
|
| 41 |
|
| 42 |
def _load_processed_files(self) -> Dict[str, str]:
|
|
|
|
| 43 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 44 |
try:
|
| 45 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
|
|
| 49 |
return {}
|
| 50 |
|
| 51 |
def _save_processed_files(self):
|
|
|
|
| 52 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 53 |
json.dump(self.processed_files, f, indent=2)
|
| 54 |
|
| 55 |
def _get_file_hash(self, file_path: str) -> str:
|
|
|
|
| 56 |
hash_md5 = hashlib.md5()
|
| 57 |
with open(file_path, "rb") as f:
|
| 58 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
|
|
| 60 |
return hash_md5.hexdigest()
|
| 61 |
|
| 62 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
|
|
|
| 63 |
text = ""
|
| 64 |
try:
|
| 65 |
with open(pdf_path, 'rb') as file:
|
| 66 |
reader = PyPDF2.PdfReader(file)
|
| 67 |
page_count = len(reader.pages)
|
| 68 |
self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
|
|
|
|
| 69 |
for page_num, page in enumerate(reader.pages):
|
| 70 |
page_text = page.extract_text()
|
| 71 |
text += page_text + "\n"
|
| 72 |
self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
|
| 73 |
except Exception as e:
|
| 74 |
self._debug_print("ERROR extracting text", str(e))
|
|
|
|
| 75 |
self._debug_print("Total Text Extracted", len(text))
|
| 76 |
return text
|
| 77 |
|
| 78 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
|
|
|
| 79 |
images_data = []
|
| 80 |
try:
|
| 81 |
+
self._debug_print("Image Extraction Started", f"File: {pdf_path}")
|
|
|
|
| 82 |
images = convert_from_path(pdf_path, dpi=150)
|
| 83 |
+
self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
|
|
|
|
| 84 |
for idx, image in enumerate(images):
|
| 85 |
+
self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
|
|
|
|
| 86 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 87 |
image.save(image_path)
|
| 88 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 89 |
+
self._debug_print(f"Image {idx} OCR")
|
|
|
|
|
|
|
| 90 |
try:
|
| 91 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
|
|
|
| 92 |
ocr_text = ocr_text.strip()
|
|
|
|
| 93 |
if not ocr_text or len(ocr_text) < 5:
|
| 94 |
+
self._debug_print(f"Image {idx} OCR Result", f"EMPTY ({len(ocr_text)} chars)")
|
| 95 |
else:
|
| 96 |
+
self._debug_print(f"Image {idx} OCR Result", f"Success - {len(ocr_text)} chars: {ocr_text[:150]}")
|
|
|
|
| 97 |
except Exception as ocr_error:
|
| 98 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 99 |
+
ocr_text = f"Image {idx}: OCR failed - {str(ocr_error)}"
|
|
|
|
| 100 |
images_data.append({
|
| 101 |
'page': idx,
|
| 102 |
'path': str(image_path),
|
|
|
|
| 105 |
})
|
| 106 |
except Exception as e:
|
| 107 |
self._debug_print("ERROR extracting images", str(e))
|
|
|
|
| 108 |
self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
|
| 109 |
return images_data
|
| 110 |
|
| 111 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
|
|
|
| 112 |
tables_data = []
|
| 113 |
try:
|
| 114 |
text = self._extract_text_from_pdf(pdf_path)
|
| 115 |
lines = text.split('\n')
|
| 116 |
+
self._debug_print("Table Detection", f"Scanning {len(lines)} lines")
|
|
|
|
|
|
|
| 117 |
current_table = []
|
| 118 |
for line in lines:
|
| 119 |
if '|' in line or '\t' in line:
|
|
|
|
| 125 |
'description': f"Table {len(tables_data) + 1}"
|
| 126 |
})
|
| 127 |
current_table = []
|
|
|
|
| 128 |
if current_table and len(current_table) > 1:
|
| 129 |
tables_data.append({
|
| 130 |
'content': '\n'.join(current_table),
|
| 131 |
'description': f"Table {len(tables_data) + 1}"
|
| 132 |
})
|
|
|
|
| 133 |
self._debug_print("Tables Found", len(tables_data))
|
| 134 |
except Exception as e:
|
| 135 |
self._debug_print("ERROR extracting tables", str(e))
|
|
|
|
| 136 |
return tables_data
|
| 137 |
|
| 138 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
|
|
|
| 139 |
file_hash = self._get_file_hash(pdf_path)
|
| 140 |
doc_id = Path(pdf_path).stem
|
|
|
|
| 141 |
self._debug_print("PDF Parsing Started", f"File: {doc_id}")
|
|
|
|
| 142 |
if doc_id in self.processed_files:
|
| 143 |
if self.processed_files[doc_id] == file_hash:
|
| 144 |
self._debug_print("Status", f"File {doc_id} already processed")
|
| 145 |
return self._load_extracted_data(doc_id)
|
| 146 |
+
print(f"Processing PDF: {doc_id}")
|
|
|
|
|
|
|
| 147 |
text = self._extract_text_from_pdf(pdf_path)
|
| 148 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 149 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 150 |
+
self._debug_print("Extraction Summary", {
|
|
|
|
| 151 |
'text_length': len(text),
|
| 152 |
'images_count': len(images),
|
| 153 |
'tables_count': len(tables),
|
| 154 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 155 |
})
|
|
|
|
| 156 |
self._save_extracted_data(doc_id, text, images, tables)
|
|
|
|
| 157 |
self.processed_files[doc_id] = file_hash
|
| 158 |
self._save_processed_files()
|
|
|
|
| 159 |
return text, images, tables
|
| 160 |
|
| 161 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
|
|
|
| 162 |
data = {
|
| 163 |
'text': text,
|
| 164 |
'images': images,
|
|
|
|
| 167 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 168 |
with open(data_path, 'w', encoding='utf-8') as f:
|
| 169 |
json.dump(data, f, ensure_ascii=False, indent=2)
|
|
|
|
| 170 |
self._debug_print("Data Saved", str(data_path))
|
| 171 |
|
| 172 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
|
|
|
| 173 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 174 |
try:
|
| 175 |
with open(data_path, 'r', encoding='utf-8') as f:
|
| 176 |
data = json.load(f)
|
| 177 |
return data['text'], data['images'], data['tables']
|
| 178 |
except:
|
| 179 |
+
return "", [], []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/rag_system.py
CHANGED
|
@@ -1,55 +1,38 @@
|
|
| 1 |
-
"""
|
| 2 |
-
RAG основной pipeline
|
| 3 |
-
"""
|
| 4 |
from typing import List, Dict
|
| 5 |
from langchain_openai import ChatOpenAI
|
| 6 |
-
from langchain_core.messages import HumanMessage
|
| 7 |
import base64
|
| 8 |
import os
|
| 9 |
from pathlib import Path
|
| 10 |
from config import (
|
| 11 |
-
OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
|
| 12 |
LANGUAGE, CHROMA_DB_PATH
|
| 13 |
)
|
| 14 |
|
| 15 |
-
|
| 16 |
class VisualMultimodalRAG:
|
| 17 |
-
"""
|
| 18 |
-
RAG - подготовительный этап:
|
| 19 |
-
1. Кодирует изображение в base64 и отправляет в gpt-4o-mini
|
| 20 |
-
2. Получает описание изображения
|
| 21 |
-
3. Сохраняет описание в векторное хранилище
|
| 22 |
-
"""
|
| 23 |
-
|
| 24 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 25 |
api_key = api_key or OPENAI_API_KEY
|
| 26 |
self.debug = debug
|
| 27 |
-
|
| 28 |
self.llm = ChatOpenAI(
|
| 29 |
-
model_name=
|
| 30 |
api_key=api_key,
|
| 31 |
temperature=TEMPERATURE,
|
| 32 |
max_tokens=MAX_TOKENS,
|
| 33 |
)
|
| 34 |
-
|
| 35 |
self.language = LANGUAGE
|
| 36 |
-
self.visual_summaries_log = []
|
| 37 |
-
|
| 38 |
if self.debug:
|
| 39 |
-
print(
|
| 40 |
|
| 41 |
def _debug_print(self, label: str, data: any):
|
| 42 |
-
"""Debug"""
|
| 43 |
if self.debug:
|
| 44 |
-
print(f"
|
| 45 |
if isinstance(data, (list, dict)):
|
| 46 |
-
print(f"
|
| 47 |
-
print(f"
|
| 48 |
else:
|
| 49 |
-
print(f"
|
| 50 |
|
| 51 |
def _image_to_base64(self, image_path: str) -> str:
|
| 52 |
-
"""Конвертирует изображение в base64"""
|
| 53 |
try:
|
| 54 |
with open(image_path, 'rb') as image_file:
|
| 55 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
@@ -59,17 +42,12 @@ class VisualMultimodalRAG:
|
|
| 59 |
return None
|
| 60 |
|
| 61 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
| 62 |
-
"""
|
| 63 |
-
Отправляет в модель изображение для суммаризации
|
| 64 |
-
"""
|
| 65 |
if not os.path.exists(image_path):
|
| 66 |
-
return f"
|
| 67 |
-
|
| 68 |
try:
|
| 69 |
image_base64 = self._image_to_base64(image_path)
|
| 70 |
if not image_base64:
|
| 71 |
-
return f"
|
| 72 |
-
|
| 73 |
file_ext = Path(image_path).suffix.lower()
|
| 74 |
media_type_map = {
|
| 75 |
'.jpg': 'image/jpeg',
|
|
@@ -79,9 +57,7 @@ class VisualMultimodalRAG:
|
|
| 79 |
'.webp': 'image/webp'
|
| 80 |
}
|
| 81 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 82 |
-
|
| 83 |
-
print(f" Analyzing image {image_idx}...")
|
| 84 |
-
|
| 85 |
message = HumanMessage(
|
| 86 |
content=[
|
| 87 |
{
|
|
@@ -92,86 +68,62 @@ class VisualMultimodalRAG:
|
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"type": "text",
|
| 95 |
-
"text": f"""
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
Результат:"""
|
| 106 |
}
|
| 107 |
],
|
| 108 |
)
|
| 109 |
-
|
| 110 |
response = self.llm.invoke([message])
|
| 111 |
analysis = response.content.strip()
|
| 112 |
-
|
| 113 |
if self.debug:
|
| 114 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 115 |
-
|
| 116 |
-
print(f" Image {image_idx} analyzed successfully")
|
| 117 |
return analysis
|
| 118 |
-
|
| 119 |
except Exception as e:
|
| 120 |
-
error_msg = f"
|
| 121 |
-
print(f"
|
| 122 |
return error_msg
|
| 123 |
|
| 124 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 125 |
-
"""
|
| 126 |
-
Считывает изображения и отправляет на анализ
|
| 127 |
-
"""
|
| 128 |
visual_analyses = []
|
| 129 |
-
|
| 130 |
for idx, image in enumerate(images):
|
| 131 |
image_path = image.get('path', '')
|
| 132 |
-
|
| 133 |
if not image_path:
|
| 134 |
-
print(f"
|
| 135 |
continue
|
| 136 |
-
|
| 137 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 138 |
-
|
| 139 |
visual_analyses.append({
|
| 140 |
'type': 'image_visual',
|
| 141 |
'image_index': idx,
|
| 142 |
'image_path': image_path,
|
| 143 |
'visual_analysis': visual_analysis,
|
| 144 |
-
'ocr_text': image.get('ocr_text', '')
|
| 145 |
})
|
| 146 |
-
|
| 147 |
return visual_analyses
|
| 148 |
|
| 149 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
| 150 |
-
"""
|
| 151 |
-
Отправляет куски текста на суммаризацию
|
| 152 |
-
"""
|
| 153 |
chunks = []
|
| 154 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
| 155 |
-
|
| 156 |
self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")
|
| 157 |
-
|
| 158 |
for idx, chunk in enumerate(text_chunks):
|
| 159 |
if len(chunk.strip()) < 50:
|
| 160 |
continue
|
| 161 |
-
|
| 162 |
try:
|
| 163 |
-
prompt = f"""
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
Текст :
|
| 167 |
{chunk}
|
| 168 |
-
|
| 169 |
-
Результат:"""
|
| 170 |
-
|
| 171 |
message = HumanMessage(content=prompt)
|
| 172 |
response = self.llm.invoke([message])
|
| 173 |
summary = response.content.strip()
|
| 174 |
-
|
| 175 |
chunks.append({
|
| 176 |
'type': 'text_chunk',
|
| 177 |
'chunk_index': len(chunks),
|
|
@@ -179,40 +131,27 @@ class VisualMultimodalRAG:
|
|
| 179 |
'summary': summary,
|
| 180 |
'chunk_length': len(chunk)
|
| 181 |
})
|
| 182 |
-
|
| 183 |
if self.debug:
|
| 184 |
self._debug_print(f"Text Chunk {len(chunks)-1} Summary", summary)
|
| 185 |
-
|
| 186 |
except Exception as e:
|
| 187 |
print(f"Error summarizing text chunk: {e}")
|
| 188 |
-
|
| 189 |
return chunks
|
| 190 |
|
| 191 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
| 192 |
-
"""
|
| 193 |
-
Отправляет таблицы на суммаризацию
|
| 194 |
-
"""
|
| 195 |
summaries = []
|
| 196 |
-
|
| 197 |
for idx, table in enumerate(tables):
|
| 198 |
table_content = table.get('content', '')
|
| 199 |
-
|
| 200 |
if not table_content or len(table_content.strip()) < 10:
|
| 201 |
continue
|
| 202 |
-
|
| 203 |
try:
|
| 204 |
-
prompt = f"""
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
Таблица:
|
| 208 |
{table_content}
|
| 209 |
-
|
| 210 |
-
Результат:"""
|
| 211 |
-
|
| 212 |
message = HumanMessage(content=prompt)
|
| 213 |
response = self.llm.invoke([message])
|
| 214 |
summary = response.content.strip()
|
| 215 |
-
|
| 216 |
summaries.append({
|
| 217 |
'type': 'table',
|
| 218 |
'table_index': idx,
|
|
@@ -220,29 +159,21 @@ class VisualMultimodalRAG:
|
|
| 220 |
'summary': summary,
|
| 221 |
'table_length': len(table_content)
|
| 222 |
})
|
| 223 |
-
|
| 224 |
if self.debug:
|
| 225 |
self._debug_print(f"Table {idx} Summary", summary)
|
| 226 |
-
|
| 227 |
except Exception as e:
|
| 228 |
print(f"Error summarizing table {idx}: {e}")
|
| 229 |
-
|
| 230 |
return summaries
|
| 231 |
|
| 232 |
def process_and_store_document(
|
| 233 |
-
self,
|
| 234 |
-
text: str,
|
| 235 |
images: List[Dict],
|
| 236 |
tables: List[Dict],
|
| 237 |
vector_store,
|
| 238 |
doc_id: str
|
| 239 |
) -> Dict:
|
| 240 |
-
""
|
| 241 |
-
Основной pipeline анализирует и сохраняет документы в хранилище
|
| 242 |
-
"""
|
| 243 |
-
|
| 244 |
-
print(f"PROCESSING ANALYSIS: {doc_id}")
|
| 245 |
-
|
| 246 |
results = {
|
| 247 |
'doc_id': doc_id,
|
| 248 |
'image_visual_analyses': [],
|
|
@@ -250,53 +181,42 @@ class VisualMultimodalRAG:
|
|
| 250 |
'table_summaries': [],
|
| 251 |
'total_stored': 0
|
| 252 |
}
|
| 253 |
-
|
| 254 |
-
print(f"\n VISUAL IMAGE ANALYSIS ({len(images)} )")
|
| 255 |
-
|
| 256 |
-
|
| 257 |
image_analyses = self.analyze_images_visually(images)
|
| 258 |
results['image_visual_analyses'] = image_analyses
|
| 259 |
-
|
| 260 |
image_docs = {
|
| 261 |
'text': ' | '.join([
|
| 262 |
-
f"Image {a['image_index']}: {a['visual_analysis']}"
|
| 263 |
for a in image_analyses
|
| 264 |
]),
|
| 265 |
'images': [],
|
| 266 |
'tables': []
|
| 267 |
}
|
| 268 |
-
|
| 269 |
for analysis in image_analyses:
|
| 270 |
-
print(f"
|
| 271 |
-
print(f"
|
| 272 |
-
print(f"
|
| 273 |
-
|
| 274 |
if image_analyses:
|
| 275 |
try:
|
| 276 |
vector_store.add_documents(
|
| 277 |
-
image_docs,
|
| 278 |
f"{doc_id}_images_visual"
|
| 279 |
)
|
| 280 |
results['total_stored'] += len(image_analyses)
|
| 281 |
-
print(f"
|
| 282 |
except Exception as e:
|
| 283 |
print(f"Error storing image analyses: {e}")
|
| 284 |
-
|
| 285 |
-
print(f"\n TEXT CHUNK SUMMARIZATION")
|
| 286 |
-
|
| 287 |
text_summaries = self.summarize_text_chunks(text)
|
| 288 |
results['text_summaries'] = text_summaries
|
| 289 |
-
|
| 290 |
text_docs = {
|
| 291 |
-
'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
|
| 292 |
-
|
| 293 |
'images': [],
|
| 294 |
'tables': []
|
| 295 |
}
|
| 296 |
-
|
| 297 |
for summary in text_summaries:
|
| 298 |
-
print(f"
|
| 299 |
-
|
| 300 |
if text_summaries:
|
| 301 |
try:
|
| 302 |
vector_store.add_documents(
|
|
@@ -304,25 +224,20 @@ class VisualMultimodalRAG:
|
|
| 304 |
f"{doc_id}_text_chunks"
|
| 305 |
)
|
| 306 |
results['total_stored'] += len(text_summaries)
|
| 307 |
-
print(f"
|
| 308 |
except Exception as e:
|
| 309 |
-
print(f"
|
| 310 |
-
|
| 311 |
-
print(f"\n TABLE SUMMARIZATION ({len(tables)}")
|
| 312 |
-
|
| 313 |
table_summaries = self.summarize_tables(tables)
|
| 314 |
results['table_summaries'] = table_summaries
|
| 315 |
-
|
| 316 |
table_docs = {
|
| 317 |
-
'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
|
| 318 |
-
|
| 319 |
'images': [],
|
| 320 |
'tables': []
|
| 321 |
}
|
| 322 |
-
|
| 323 |
for summary in table_summaries:
|
| 324 |
-
print(f"
|
| 325 |
-
|
| 326 |
if table_summaries:
|
| 327 |
try:
|
| 328 |
vector_store.add_documents(
|
|
@@ -330,17 +245,14 @@ class VisualMultimodalRAG:
|
|
| 330 |
f"{doc_id}_tables"
|
| 331 |
)
|
| 332 |
results['total_stored'] += len(table_summaries)
|
| 333 |
-
print(f"
|
| 334 |
except Exception as e:
|
| 335 |
-
print(f"
|
| 336 |
-
|
| 337 |
-
print(f"
|
| 338 |
-
print(f"
|
| 339 |
-
print(f"
|
| 340 |
-
print(f"
|
| 341 |
-
print(f" Total items stored in vector: {results['total_stored']}")
|
| 342 |
-
|
| 343 |
-
self.visual_summaries_log.append(results)
|
| 344 |
return results
|
| 345 |
|
| 346 |
def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
|
|
@@ -352,73 +264,41 @@ class VisualMultimodalRAG:
|
|
| 352 |
start = end - overlap
|
| 353 |
return chunks
|
| 354 |
|
| 355 |
-
def get_visual_summaries_log(self) -> List[Dict]:
|
| 356 |
-
return self.visual_summaries_log
|
| 357 |
-
|
| 358 |
-
|
| 359 |
class AnsweringRAG:
|
| 360 |
-
"""
|
| 361 |
-
RAG - работа с ответом на запрос:
|
| 362 |
-
1. Поиск в векторном хранилище
|
| 363 |
-
2. Анализ результатов
|
| 364 |
-
3. Предоставление ответа
|
| 365 |
-
"""
|
| 366 |
-
|
| 367 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 368 |
api_key = api_key or OPENAI_API_KEY
|
| 369 |
self.debug = debug
|
| 370 |
-
|
| 371 |
self.llm = ChatOpenAI(
|
| 372 |
-
model_name=
|
| 373 |
api_key=api_key,
|
| 374 |
temperature=TEMPERATURE,
|
| 375 |
max_tokens=MAX_TOKENS,
|
| 376 |
)
|
| 377 |
-
|
| 378 |
self.language = LANGUAGE
|
| 379 |
-
self.answer_log = []
|
| 380 |
-
|
| 381 |
if self.debug:
|
| 382 |
-
print("
|
| 383 |
|
| 384 |
def _debug_print(self, label: str, data: any):
|
| 385 |
-
"""Debug"""
|
| 386 |
if self.debug:
|
| 387 |
-
print(f"
|
| 388 |
if isinstance(data, (list, dict)):
|
| 389 |
-
print(f"
|
| 390 |
-
print(f"
|
| 391 |
else:
|
| 392 |
-
print(f"
|
| 393 |
|
| 394 |
def analyze_and_answer(
|
| 395 |
-
self,
|
| 396 |
-
question: str,
|
| 397 |
search_results: List[Dict]
|
| 398 |
) -> Dict:
|
| 399 |
-
""
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
Ответ:
|
| 403 |
-
{
|
| 404 |
-
'question': user question,
|
| 405 |
-
'answer': detailed answer,
|
| 406 |
-
'sources_used': number of sources,
|
| 407 |
-
'confidence': low/medium/high,
|
| 408 |
-
'search_results': original search results
|
| 409 |
-
}
|
| 410 |
-
"""
|
| 411 |
-
|
| 412 |
-
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
| 413 |
-
|
| 414 |
-
print(f"\n Question: {question}")
|
| 415 |
-
print(f" Search Results: {len(search_results)}")
|
| 416 |
-
|
| 417 |
if not search_results:
|
| 418 |
-
print(
|
| 419 |
-
answer = f"""
|
| 420 |
"""
|
| 421 |
-
|
| 422 |
result = {
|
| 423 |
'question': question,
|
| 424 |
'answer': answer,
|
|
@@ -426,72 +306,54 @@ class AnsweringRAG:
|
|
| 426 |
'confidence': 'low',
|
| 427 |
'search_results': []
|
| 428 |
}
|
| 429 |
-
self.answer_log.append(result)
|
| 430 |
return result
|
| 431 |
-
|
| 432 |
context_parts = []
|
| 433 |
for idx, result in enumerate(search_results, 1):
|
| 434 |
content = result.get('content', '')
|
| 435 |
-
metadata = result.get('metadata', {})
|
| 436 |
content_type = result.get('type', 'unknown')
|
| 437 |
distance = result.get('distance', 0)
|
| 438 |
relevance = 1 - distance if distance else 0
|
| 439 |
-
|
| 440 |
context_parts.append(f"""
|
| 441 |
[Source {idx} - {content_type.upper()} (relevance: {relevance:.1%})]
|
| 442 |
{content}""")
|
| 443 |
-
|
| 444 |
full_context = "\n".join(context_parts)
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
analysis_prompt = f"""Ты - ассистент по анализу документов и ответов на вопросы по ним.
|
| 449 |
-
|
| 450 |
-
ВОПРОС:
|
| 451 |
"{question}"
|
| 452 |
-
|
| 453 |
-
РЕЛЕВАНТНАЯ ИНФОРМАЦИЯ:
|
| 454 |
{full_context}
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
print(f"
|
| 466 |
-
print(f" Context size: {len(full_context)} chars")
|
| 467 |
-
print(f" Sources: {len(search_results)}")
|
| 468 |
-
|
| 469 |
try:
|
| 470 |
message = HumanMessage(content=analysis_prompt)
|
| 471 |
response = self.llm.invoke([message])
|
| 472 |
answer = response.content.strip()
|
| 473 |
-
|
| 474 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 475 |
-
|
| 476 |
-
print(f"
|
| 477 |
-
print(f"
|
| 478 |
-
print(f" Answer length: {len(answer)} chars")
|
| 479 |
-
|
| 480 |
result = {
|
| 481 |
'question': question,
|
| 482 |
'answer': answer,
|
| 483 |
'sources_used': len(search_results),
|
| 484 |
'confidence': confidence,
|
| 485 |
-
'search_results': search_results
|
|
|
|
| 486 |
}
|
| 487 |
-
|
| 488 |
-
self.answer_log.append(result)
|
| 489 |
return result
|
| 490 |
-
|
| 491 |
except Exception as e:
|
| 492 |
-
print(f"
|
| 493 |
-
answer =
|
| 494 |
-
|
| 495 |
result = {
|
| 496 |
'question': question,
|
| 497 |
'answer': answer,
|
|
@@ -500,19 +362,24 @@ class AnsweringRAG:
|
|
| 500 |
'error': str(e),
|
| 501 |
'search_results': search_results
|
| 502 |
}
|
| 503 |
-
|
| 504 |
-
self.answer_log.append(result)
|
| 505 |
return result
|
| 506 |
|
| 507 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
| 508 |
-
"""Уверенность в ответе на основании найденных источников информации"""
|
| 509 |
answer_length = len(answer)
|
| 510 |
-
|
| 511 |
if sources_count >= 3 and answer_length > 500:
|
| 512 |
return "high"
|
| 513 |
-
|
| 514 |
elif sources_count >= 2 and answer_length > 200:
|
| 515 |
return "medium"
|
| 516 |
-
|
| 517 |
else:
|
| 518 |
return "low"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import List, Dict
|
| 2 |
from langchain_openai import ChatOpenAI
|
| 3 |
+
from langchain_core.messages import HumanMessage
|
| 4 |
import base64
|
| 5 |
import os
|
| 6 |
from pathlib import Path
|
| 7 |
from config import (
|
| 8 |
+
OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
|
| 9 |
LANGUAGE, CHROMA_DB_PATH
|
| 10 |
)
|
| 11 |
|
|
|
|
| 12 |
class VisualMultimodalRAG:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 14 |
api_key = api_key or OPENAI_API_KEY
|
| 15 |
self.debug = debug
|
|
|
|
| 16 |
self.llm = ChatOpenAI(
|
| 17 |
+
model_name="gpt-4o-mini",
|
| 18 |
api_key=api_key,
|
| 19 |
temperature=TEMPERATURE,
|
| 20 |
max_tokens=MAX_TOKENS,
|
| 21 |
)
|
|
|
|
| 22 |
self.language = LANGUAGE
|
|
|
|
|
|
|
| 23 |
if self.debug:
|
| 24 |
+
print("VisualMultimodalRAG initialized")
|
| 25 |
|
| 26 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 27 |
if self.debug:
|
| 28 |
+
print(f"DEBUG [{label}]:")
|
| 29 |
if isinstance(data, (list, dict)):
|
| 30 |
+
print(f" Type: {type(data).__name__}")
|
| 31 |
+
print(f" Content: {str(data)[:300]}...")
|
| 32 |
else:
|
| 33 |
+
print(f" {data}")
|
| 34 |
|
| 35 |
def _image_to_base64(self, image_path: str) -> str:
|
|
|
|
| 36 |
try:
|
| 37 |
with open(image_path, 'rb') as image_file:
|
| 38 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
|
|
| 42 |
return None
|
| 43 |
|
| 44 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
|
|
|
|
|
|
|
|
|
| 45 |
if not os.path.exists(image_path):
|
| 46 |
+
return f"Image {image_idx}: File not found - {image_path}"
|
|
|
|
| 47 |
try:
|
| 48 |
image_base64 = self._image_to_base64(image_path)
|
| 49 |
if not image_base64:
|
| 50 |
+
return f"Image {image_idx}: Could not convert to base64"
|
|
|
|
| 51 |
file_ext = Path(image_path).suffix.lower()
|
| 52 |
media_type_map = {
|
| 53 |
'.jpg': 'image/jpeg',
|
|
|
|
| 57 |
'.webp': 'image/webp'
|
| 58 |
}
|
| 59 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 60 |
+
print(f"Analyzing image {image_idx} visually (as {media_type})...")
|
|
|
|
|
|
|
| 61 |
message = HumanMessage(
|
| 62 |
content=[
|
| 63 |
{
|
|
|
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"type": "text",
|
| 71 |
+
"text": f"""You are assistant for analyzing and aggregating information. Analyze this image.
|
| 72 |
+
Provide a visual analysis that includes:
|
| 73 |
+
1. Main objects and element
|
| 74 |
+
2. Data/Content - Any numbers, text, charts, graphs
|
| 75 |
+
3. What this image is showing or representing
|
| 76 |
+
4. Important patterns, trends, or information
|
| 77 |
+
5. How image relates to document content
|
| 78 |
+
Be brief and meaningful. Focus on visual information that cannot be extracted from text. Response on {self.language}.
|
| 79 |
+
Analysis:"""
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
],
|
| 82 |
)
|
|
|
|
| 83 |
response = self.llm.invoke([message])
|
| 84 |
analysis = response.content.strip()
|
|
|
|
| 85 |
if self.debug:
|
| 86 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 87 |
+
print(f"Image {image_idx} analyzed successfully")
|
|
|
|
| 88 |
return analysis
|
|
|
|
| 89 |
except Exception as e:
|
| 90 |
+
error_msg = f"Image {image_idx}: Vision analysis failed - {str(e)}"
|
| 91 |
+
print(f"Error analyzing image {image_idx}: {e}")
|
| 92 |
return error_msg
|
| 93 |
|
| 94 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
| 95 |
visual_analyses = []
|
|
|
|
| 96 |
for idx, image in enumerate(images):
|
| 97 |
image_path = image.get('path', '')
|
|
|
|
| 98 |
if not image_path:
|
| 99 |
+
print(f"Image {idx}: No path provided")
|
| 100 |
continue
|
|
|
|
| 101 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
|
|
|
| 102 |
visual_analyses.append({
|
| 103 |
'type': 'image_visual',
|
| 104 |
'image_index': idx,
|
| 105 |
'image_path': image_path,
|
| 106 |
'visual_analysis': visual_analysis,
|
| 107 |
+
'ocr_text': image.get('ocr_text', '')
|
| 108 |
})
|
|
|
|
| 109 |
return visual_analyses
|
| 110 |
|
| 111 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
| 112 |
chunks = []
|
| 113 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
|
|
|
| 114 |
self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")
|
|
|
|
| 115 |
for idx, chunk in enumerate(text_chunks):
|
| 116 |
if len(chunk.strip()) < 50:
|
| 117 |
continue
|
|
|
|
| 118 |
try:
|
| 119 |
+
prompt = f"""Summarize this text chunk in {self.language}.
|
| 120 |
+
Be brief and meaningful. Extract key points, facts, and main ideas.
|
| 121 |
+
Text Chunk:
|
|
|
|
| 122 |
{chunk}
|
| 123 |
+
Summary:"""
|
|
|
|
|
|
|
| 124 |
message = HumanMessage(content=prompt)
|
| 125 |
response = self.llm.invoke([message])
|
| 126 |
summary = response.content.strip()
|
|
|
|
| 127 |
chunks.append({
|
| 128 |
'type': 'text_chunk',
|
| 129 |
'chunk_index': len(chunks),
|
|
|
|
| 131 |
'summary': summary,
|
| 132 |
'chunk_length': len(chunk)
|
| 133 |
})
|
|
|
|
| 134 |
if self.debug:
|
| 135 |
self._debug_print(f"Text Chunk {len(chunks)-1} Summary", summary)
|
|
|
|
| 136 |
except Exception as e:
|
| 137 |
print(f"Error summarizing text chunk: {e}")
|
|
|
|
| 138 |
return chunks
|
| 139 |
|
| 140 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
| 141 |
summaries = []
|
|
|
|
| 142 |
for idx, table in enumerate(tables):
|
| 143 |
table_content = table.get('content', '')
|
|
|
|
| 144 |
if not table_content or len(table_content.strip()) < 10:
|
| 145 |
continue
|
|
|
|
| 146 |
try:
|
| 147 |
+
prompt = f"""Analyze and summarize this table/structured data in {self.language}.
|
| 148 |
+
Extract key insights, row/column meanings, and important figures. Be brief and meaningful.
|
| 149 |
+
Table Content:
|
|
|
|
| 150 |
{table_content}
|
| 151 |
+
Summary:"""
|
|
|
|
|
|
|
| 152 |
message = HumanMessage(content=prompt)
|
| 153 |
response = self.llm.invoke([message])
|
| 154 |
summary = response.content.strip()
|
|
|
|
| 155 |
summaries.append({
|
| 156 |
'type': 'table',
|
| 157 |
'table_index': idx,
|
|
|
|
| 159 |
'summary': summary,
|
| 160 |
'table_length': len(table_content)
|
| 161 |
})
|
|
|
|
| 162 |
if self.debug:
|
| 163 |
self._debug_print(f"Table {idx} Summary", summary)
|
|
|
|
| 164 |
except Exception as e:
|
| 165 |
print(f"Error summarizing table {idx}: {e}")
|
|
|
|
| 166 |
return summaries
|
| 167 |
|
| 168 |
def process_and_store_document(
|
| 169 |
+
self,
|
| 170 |
+
text: str,
|
| 171 |
images: List[Dict],
|
| 172 |
tables: List[Dict],
|
| 173 |
vector_store,
|
| 174 |
doc_id: str
|
| 175 |
) -> Dict:
|
| 176 |
+
print("PROCESSING WITH VISUAL IMAGE ANALYSIS: " + doc_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
results = {
|
| 178 |
'doc_id': doc_id,
|
| 179 |
'image_visual_analyses': [],
|
|
|
|
| 181 |
'table_summaries': [],
|
| 182 |
'total_stored': 0
|
| 183 |
}
|
| 184 |
+
print("VISUAL IMAGE ANALYSIS (" + str(len(images)) + " total)")
|
|
|
|
|
|
|
|
|
|
| 185 |
image_analyses = self.analyze_images_visually(images)
|
| 186 |
results['image_visual_analyses'] = image_analyses
|
|
|
|
| 187 |
image_docs = {
|
| 188 |
'text': ' | '.join([
|
| 189 |
+
f"Image {a['image_index']}: {a['visual_analysis']}"
|
| 190 |
for a in image_analyses
|
| 191 |
]),
|
| 192 |
'images': [],
|
| 193 |
'tables': []
|
| 194 |
}
|
|
|
|
| 195 |
for analysis in image_analyses:
|
| 196 |
+
print(f" Image {analysis['image_index']} (visual analysis)")
|
| 197 |
+
print(f" Path: {analysis['image_path']}")
|
| 198 |
+
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
|
|
|
| 199 |
if image_analyses:
|
| 200 |
try:
|
| 201 |
vector_store.add_documents(
|
| 202 |
+
image_docs,
|
| 203 |
f"{doc_id}_images_visual"
|
| 204 |
)
|
| 205 |
results['total_stored'] += len(image_analyses)
|
| 206 |
+
print(f"Stored {len(image_analyses)} image visual analyses")
|
| 207 |
except Exception as e:
|
| 208 |
print(f"Error storing image analyses: {e}")
|
| 209 |
+
print("TEXT CHUNK SUMMARIZATION")
|
|
|
|
|
|
|
| 210 |
text_summaries = self.summarize_text_chunks(text)
|
| 211 |
results['text_summaries'] = text_summaries
|
|
|
|
| 212 |
text_docs = {
|
| 213 |
+
'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
|
| 214 |
+
for s in text_summaries]),
|
| 215 |
'images': [],
|
| 216 |
'tables': []
|
| 217 |
}
|
|
|
|
| 218 |
for summary in text_summaries:
|
| 219 |
+
print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
|
|
|
|
| 220 |
if text_summaries:
|
| 221 |
try:
|
| 222 |
vector_store.add_documents(
|
|
|
|
| 224 |
f"{doc_id}_text_chunks"
|
| 225 |
)
|
| 226 |
results['total_stored'] += len(text_summaries)
|
| 227 |
+
print(f"Stored {len(text_summaries)} text chunk summaries")
|
| 228 |
except Exception as e:
|
| 229 |
+
print(f"Error storing text summaries: {e}")
|
| 230 |
+
print("TABLE SUMMARIZATION (" + str(len(tables)) + " total)")
|
|
|
|
|
|
|
| 231 |
table_summaries = self.summarize_tables(tables)
|
| 232 |
results['table_summaries'] = table_summaries
|
|
|
|
| 233 |
table_docs = {
|
| 234 |
+
'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
|
| 235 |
+
for s in table_summaries]),
|
| 236 |
'images': [],
|
| 237 |
'tables': []
|
| 238 |
}
|
|
|
|
| 239 |
for summary in table_summaries:
|
| 240 |
+
print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
|
|
|
|
| 241 |
if table_summaries:
|
| 242 |
try:
|
| 243 |
vector_store.add_documents(
|
|
|
|
| 245 |
f"{doc_id}_tables"
|
| 246 |
)
|
| 247 |
results['total_stored'] += len(table_summaries)
|
| 248 |
+
print(f"Stored {len(table_summaries)} table summaries")
|
| 249 |
except Exception as e:
|
| 250 |
+
print(f"Error storing table summaries: {e}")
|
| 251 |
+
print("STORAGE SUMMARY")
|
| 252 |
+
print(f" Images analyzed and stored: {len(image_analyses)}")
|
| 253 |
+
print(f" Text chunks summarized and stored: {len(text_summaries)}")
|
| 254 |
+
print(f" Tables summarized and stored: {len(table_summaries)}")
|
| 255 |
+
print(f" Total items stored in vector: {results['total_stored']}")
|
|
|
|
|
|
|
|
|
|
| 256 |
return results
|
| 257 |
|
| 258 |
def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
|
|
|
|
| 264 |
start = end - overlap
|
| 265 |
return chunks
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
class AnsweringRAG:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 269 |
api_key = api_key or OPENAI_API_KEY
|
| 270 |
self.debug = debug
|
|
|
|
| 271 |
self.llm = ChatOpenAI(
|
| 272 |
+
model_name="gpt-4o-mini",
|
| 273 |
api_key=api_key,
|
| 274 |
temperature=TEMPERATURE,
|
| 275 |
max_tokens=MAX_TOKENS,
|
| 276 |
)
|
|
|
|
| 277 |
self.language = LANGUAGE
|
|
|
|
|
|
|
| 278 |
if self.debug:
|
| 279 |
+
print("AnsweringRAG initialized")
|
| 280 |
|
| 281 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 282 |
if self.debug:
|
| 283 |
+
print(f"DEBUG [{label}]:")
|
| 284 |
if isinstance(data, (list, dict)):
|
| 285 |
+
print(f" Type: {type(data).__name__}")
|
| 286 |
+
print(f" Content: {str(data)[:300]}...")
|
| 287 |
else:
|
| 288 |
+
print(f" {data}")
|
| 289 |
|
| 290 |
def analyze_and_answer(
|
| 291 |
+
self,
|
| 292 |
+
question: str,
|
| 293 |
search_results: List[Dict]
|
| 294 |
) -> Dict:
|
| 295 |
+
print("ANALYZING QUESTION & GENERATING ANSWER")
|
| 296 |
+
print(f"Question: {question}")
|
| 297 |
+
print(f"Search Results Found: {len(search_results)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
if not search_results:
|
| 299 |
+
print("No search results found!")
|
| 300 |
+
answer = f"""No relevant information in the document to answer question: "{question}"
|
| 301 |
"""
|
|
|
|
| 302 |
result = {
|
| 303 |
'question': question,
|
| 304 |
'answer': answer,
|
|
|
|
| 306 |
'confidence': 'low',
|
| 307 |
'search_results': []
|
| 308 |
}
|
|
|
|
| 309 |
return result
|
|
|
|
| 310 |
context_parts = []
|
| 311 |
for idx, result in enumerate(search_results, 1):
|
| 312 |
content = result.get('content', '')
|
|
|
|
| 313 |
content_type = result.get('type', 'unknown')
|
| 314 |
distance = result.get('distance', 0)
|
| 315 |
relevance = 1 - distance if distance else 0
|
|
|
|
| 316 |
context_parts.append(f"""
|
| 317 |
[Source {idx} - {content_type.upper()} (relevance: {relevance:.1%})]
|
| 318 |
{content}""")
|
|
|
|
| 319 |
full_context = "\n".join(context_parts)
|
| 320 |
+
self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
|
| 321 |
+
analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
|
| 322 |
+
USER QUESTION:
|
|
|
|
|
|
|
|
|
|
| 323 |
"{question}"
|
| 324 |
+
RELEVANT CONTENT FROM DOCUMENT:
|
|
|
|
| 325 |
{full_context}
|
| 326 |
+
INSTRUCTIONS:
|
| 327 |
+
1. Analyze the provided content carefully
|
| 328 |
+
2. Extract information relevant to the question
|
| 329 |
+
3. Synthesize a clear, comprehensive answer in {self.language}
|
| 330 |
+
4. If the content doesn't fully answer the question, explain what information is available
|
| 331 |
+
5. Be specific and cite the content when relevant
|
| 332 |
+
6. Structure your answer clearly with key points
|
| 333 |
+
ANSWER:"""
|
| 334 |
+
print("Analyzing search results...")
|
| 335 |
+
print(f" Context size: {len(full_context)} characters")
|
| 336 |
+
print(f" Sources: {len(search_results)}")
|
|
|
|
|
|
|
|
|
|
| 337 |
try:
|
| 338 |
message = HumanMessage(content=analysis_prompt)
|
| 339 |
response = self.llm.invoke([message])
|
| 340 |
answer = response.content.strip()
|
|
|
|
| 341 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 342 |
+
print("Answer generated successfully")
|
| 343 |
+
print(f" Confidence: {confidence}")
|
| 344 |
+
print(f" Answer length: {len(answer)} characters")
|
|
|
|
|
|
|
| 345 |
result = {
|
| 346 |
'question': question,
|
| 347 |
'answer': answer,
|
| 348 |
'sources_used': len(search_results),
|
| 349 |
'confidence': confidence,
|
| 350 |
+
'search_results': search_results,
|
| 351 |
+
'formatted_sources': self._format_sources(search_results)
|
| 352 |
}
|
|
|
|
|
|
|
| 353 |
return result
|
|
|
|
| 354 |
except Exception as e:
|
| 355 |
+
print(f"Error generating answer: {e}")
|
| 356 |
+
answer = "Error while analyzing the search results"
|
|
|
|
| 357 |
result = {
|
| 358 |
'question': question,
|
| 359 |
'answer': answer,
|
|
|
|
| 362 |
'error': str(e),
|
| 363 |
'search_results': search_results
|
| 364 |
}
|
|
|
|
|
|
|
| 365 |
return result
|
| 366 |
|
| 367 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
|
|
|
| 368 |
answer_length = len(answer)
|
|
|
|
| 369 |
if sources_count >= 3 and answer_length > 500:
|
| 370 |
return "high"
|
|
|
|
| 371 |
elif sources_count >= 2 and answer_length > 200:
|
| 372 |
return "medium"
|
|
|
|
| 373 |
else:
|
| 374 |
return "low"
|
| 375 |
+
|
| 376 |
+
def _format_sources(self, search_results: List[Dict]) -> List[Dict]:
|
| 377 |
+
formatted_sources = []
|
| 378 |
+
for idx, source in enumerate(search_results, 1):
|
| 379 |
+
formatted_sources.append({
|
| 380 |
+
'index': idx,
|
| 381 |
+
'type': source.get('type', 'unknown'),
|
| 382 |
+
'content': source.get('content', ''),
|
| 383 |
+
'relevance': 1 - source.get('distance', 0) if source.get('distance') else 0
|
| 384 |
+
})
|
| 385 |
+
return formatted_sources
|
src/vector_store.py
CHANGED
|
@@ -1,209 +1,88 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Векторное хранилище и Эмбеддер"
|
| 3 |
-
"""
|
| 4 |
import os
|
| 5 |
-
import json
|
| 6 |
from typing import List, Dict
|
|
|
|
| 7 |
import chromadb
|
| 8 |
-
from
|
| 9 |
-
import numpy as np
|
| 10 |
-
from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class CLIPEmbedder:
|
| 14 |
-
"""Эмбеддер"""
|
| 15 |
-
def __init__(self, model_name: str = EMBEDDING_MODEL):
|
| 16 |
-
print(f"Embedding model: {model_name}")
|
| 17 |
-
self.model = SentenceTransformer(model_name)
|
| 18 |
-
print(f"Model loaded successfully")
|
| 19 |
-
|
| 20 |
-
def embed(self, text: str) -> List[float]:
|
| 21 |
-
"""Эмбеддинг для текста"""
|
| 22 |
-
try:
|
| 23 |
-
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 24 |
-
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
| 25 |
-
except Exception as e:
|
| 26 |
-
print(f"Error embedding text: {e}")
|
| 27 |
-
return [0.0] * EMBEDDING_DIM
|
| 28 |
-
|
| 29 |
-
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
| 30 |
-
"""Эмбеддинг для текста"""
|
| 31 |
-
try:
|
| 32 |
-
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 33 |
-
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
| 34 |
-
except Exception as e:
|
| 35 |
-
print(f"Error embedding batch: {e}")
|
| 36 |
-
return [[0.0] * EMBEDDING_DIM] * len(texts)
|
| 37 |
-
|
| 38 |
|
| 39 |
class VectorStore:
|
| 40 |
-
"""Векторное хранилище"""
|
| 41 |
def __init__(self):
|
| 42 |
-
self.
|
| 43 |
-
self.
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
try:
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
)
|
| 51 |
-
print(f"
|
| 52 |
except Exception as e:
|
| 53 |
-
print(f"Error
|
| 54 |
-
|
| 55 |
-
path=self.persist_directory
|
| 56 |
-
)
|
| 57 |
-
|
| 58 |
-
try:
|
| 59 |
-
self.collection = self.client.get_or_create_colletion(
|
| 60 |
-
name="multimodal_rag",
|
| 61 |
-
metadata={"hnsw:space": "cosine"}
|
| 62 |
-
)
|
| 63 |
-
count = self.collection.count()
|
| 64 |
-
print(f"Collection loaded: {count} items in store")
|
| 65 |
-
except Exception as e:
|
| 66 |
-
print(f"Error with collection: {e}")
|
| 67 |
-
self.collection = self.client.get_or_create_collection(
|
| 68 |
-
name="multimodal_rag"
|
| 69 |
-
)
|
| 70 |
-
|
| 71 |
-
def add_documents(self, documents: List[Dict], doc_id: str):
|
| 72 |
-
"""Добавление документов в векторное хранилище"""
|
| 73 |
-
texts = []
|
| 74 |
-
metadatas = []
|
| 75 |
-
ids = []
|
| 76 |
-
|
| 77 |
-
print(f"\nAdding document: {doc_id}")
|
| 78 |
-
|
| 79 |
-
if 'text' in documents and documents['text']:
|
| 80 |
-
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 81 |
-
for idx, chunk in enumerate(chunks):
|
| 82 |
-
texts.append(chunk)
|
| 83 |
-
metadatas.append({
|
| 84 |
-
'doc_id': doc_id,
|
| 85 |
-
'type': 'text',
|
| 86 |
-
'chunk_idx': str(idx)
|
| 87 |
-
})
|
| 88 |
-
ids.append(f"{doc_id}_text_{idx}")
|
| 89 |
-
print(f" Text: {len(chunks)} chunks")
|
| 90 |
-
|
| 91 |
-
if 'images' in documents:
|
| 92 |
-
image_count = 0
|
| 93 |
-
for idx, image_data in enumerate(documents['images']):
|
| 94 |
-
if image_data.get('ocr_text'):
|
| 95 |
-
texts.append(f"Image {idx}: {image_data['ocr_text']}")
|
| 96 |
-
metadatas.append({
|
| 97 |
-
'doc_id': doc_id,
|
| 98 |
-
'type': 'image',
|
| 99 |
-
'image_idx': str(idx),
|
| 100 |
-
'image_path': image_data.get('path', '')
|
| 101 |
-
})
|
| 102 |
-
ids.append(f"{doc_id}_image_{idx}")
|
| 103 |
-
image_count += 1
|
| 104 |
-
if image_count > 0:
|
| 105 |
-
print(f" Images: {image_count} with OCR text")
|
| 106 |
-
|
| 107 |
-
if 'tables' in documents:
|
| 108 |
-
table_count = 0
|
| 109 |
-
for idx, table_data in enumerate(documents['tables']):
|
| 110 |
-
if table_data.get('content'):
|
| 111 |
-
texts.append(f"Table {idx}: {table_data.get('content', '')}")
|
| 112 |
-
metadatas.append({
|
| 113 |
-
'doc_id': doc_id,
|
| 114 |
-
'type': 'table',
|
| 115 |
-
'table_idx': str(idx)
|
| 116 |
-
})
|
| 117 |
-
ids.append(f"{doc_id}_table_{idx}")
|
| 118 |
-
table_count += 1
|
| 119 |
-
if table_count > 0:
|
| 120 |
-
print(f" Tables: {table_count}")
|
| 121 |
-
|
| 122 |
-
if texts:
|
| 123 |
-
print(f" 🔄 Generating {len(texts)} embeddings...")
|
| 124 |
-
embeddings = self.embedder.embed_batch(texts)
|
| 125 |
-
|
| 126 |
-
try:
|
| 127 |
-
self.collection.add(
|
| 128 |
-
ids=ids,
|
| 129 |
-
documents=texts,
|
| 130 |
-
embeddings=embeddings,
|
| 131 |
-
metadatas=metadatas
|
| 132 |
-
)
|
| 133 |
-
print(f"Successfully added {len(texts)} items to vector store")
|
| 134 |
-
except Exception as e:
|
| 135 |
-
print(f"Error adding to collection: {e}")
|
| 136 |
|
| 137 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 138 |
-
"""Поиск в векторном хранилище"""
|
| 139 |
try:
|
| 140 |
-
query_embedding = self.embedder.embed(query)
|
| 141 |
-
|
| 142 |
results = self.collection.query(
|
| 143 |
-
|
| 144 |
-
n_results=n_results
|
|
|
|
| 145 |
)
|
| 146 |
-
|
| 147 |
formatted_results = []
|
| 148 |
-
if results['documents']:
|
| 149 |
-
for
|
| 150 |
-
|
| 151 |
-
distance = results['distances'][0][i] if results['distances'] else 0
|
| 152 |
-
|
| 153 |
formatted_results.append({
|
| 154 |
'content': doc,
|
| 155 |
-
'metadata':
|
| 156 |
'distance': distance,
|
| 157 |
-
'type':
|
| 158 |
})
|
| 159 |
-
|
| 160 |
return formatted_results
|
| 161 |
except Exception as e:
|
| 162 |
print(f"Error searching vector store: {e}")
|
| 163 |
return []
|
| 164 |
|
| 165 |
-
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
| 166 |
-
"""Сплит текста"""
|
| 167 |
-
chunks = []
|
| 168 |
-
start = 0
|
| 169 |
-
while start < len(text):
|
| 170 |
-
end = start + chunk_size
|
| 171 |
-
chunks.append(text[start:end])
|
| 172 |
-
start = end - overlap
|
| 173 |
-
return chunks
|
| 174 |
-
|
| 175 |
def get_collection_info(self) -> Dict:
|
| 176 |
-
"""Получение информации о коллекции в вектороном хранилище"""
|
| 177 |
try:
|
| 178 |
count = self.collection.count()
|
| 179 |
return {
|
| 180 |
-
'name': 'multimodal_rag',
|
| 181 |
'count': count,
|
| 182 |
-
'status': '
|
| 183 |
-
'persist_path': self.
|
| 184 |
}
|
| 185 |
except Exception as e:
|
| 186 |
print(f"Error getting collection info: {e}")
|
| 187 |
-
return {
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
results = self.collection.get(where={'doc_id': doc_id})
|
| 193 |
-
if results['ids']:
|
| 194 |
-
self.collection.delete(ids=results['ids'])
|
| 195 |
-
print(f"Deleted {len(results['ids'])} documents for {doc_id}")
|
| 196 |
-
except Exception as e:
|
| 197 |
-
print(f"Error deleting documents: {e}")
|
| 198 |
|
| 199 |
def clear_all(self):
|
| 200 |
-
"""Очистка хранилища"""
|
| 201 |
try:
|
| 202 |
-
self.client.delete_collection(name="
|
| 203 |
self.collection = self.client.get_or_create_collection(
|
| 204 |
-
name="
|
| 205 |
metadata={"hnsw:space": "cosine"}
|
| 206 |
)
|
| 207 |
-
print("
|
| 208 |
except Exception as e:
|
| 209 |
-
print(f"Error clearing
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
from typing import List, Dict
|
| 3 |
+
from chromadb.config import Settings
|
| 4 |
import chromadb
|
| 5 |
+
from config import CHROMA_DB_PATH
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
class VectorStore:
|
|
|
|
| 8 |
def __init__(self):
    """Open (or create) the persistent Chroma collection used by the app.

    Builds a Chroma client from ``Settings`` pointed at ``CHROMA_DB_PATH``
    and binds the "documents" collection with cosine distance.
    """
    # Where Chroma persists its data on disk (from project config).
    self.chroma_path = CHROMA_DB_PATH
    self.settings = Settings(
        # NOTE(review): `chroma_db_impl_embed_collection_mixin` does not look
        # like a documented chromadb Settings field — confirm against the
        # installed chromadb version; unknown kwargs may raise or be ignored.
        chroma_db_impl_embed_collection_mixin=True,
        persist_directory=self.chroma_path,
        # Opt out of Chroma's usage telemetry.
        anonymized_telemetry=False,
        # Required for client.reset()/delete_collection-style maintenance.
        allow_reset=True,
    )
    self.client = chromadb.Client(self.settings)
    # Cosine space so query distances map naturally to 1 - similarity.
    self.collection = self.client.get_or_create_collection(
        name="documents",
        metadata={"hnsw:space": "cosine"}
    )
|
| 21 |
+
|
| 22 |
+
def add_documents(self, documents: Dict, doc_id: str):
|
| 23 |
try:
|
| 24 |
+
text = documents.get('text', '')
|
| 25 |
+
if not text or len(text.strip()) < 1:
|
| 26 |
+
print(f"Empty text for {doc_id}")
|
| 27 |
+
return
|
| 28 |
+
self.collection.add(
|
| 29 |
+
ids=[doc_id],
|
| 30 |
+
documents=[text],
|
| 31 |
+
metadatas=[{
|
| 32 |
+
'doc_id': doc_id,
|
| 33 |
+
'source': 'pdf_document'
|
| 34 |
+
}]
|
| 35 |
)
|
| 36 |
+
print(f"Added document to vector store: {doc_id}")
|
| 37 |
except Exception as e:
|
| 38 |
+
print(f"Error adding documents to vector store: {e}")
|
| 39 |
+
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
|
|
|
| 42 |
try:
|
|
|
|
|
|
|
| 43 |
results = self.collection.query(
|
| 44 |
+
query_texts=[query],
|
| 45 |
+
n_results=n_results,
|
| 46 |
+
include=['documents', 'metadatas', 'distances', 'embeddings']
|
| 47 |
)
|
|
|
|
| 48 |
formatted_results = []
|
| 49 |
+
if results and results['documents'] and len(results['documents']) > 0:
|
| 50 |
+
for idx, doc in enumerate(results['documents'][0]):
|
| 51 |
+
distance = results['distances'][0][idx] if results['distances'] else 0
|
|
|
|
|
|
|
| 52 |
formatted_results.append({
|
| 53 |
'content': doc,
|
| 54 |
+
'metadata': results['metadatas'][0][idx] if results['metadatas'] else {},
|
| 55 |
'distance': distance,
|
| 56 |
+
'type': 'document'
|
| 57 |
})
|
|
|
|
| 58 |
return formatted_results
|
| 59 |
except Exception as e:
|
| 60 |
print(f"Error searching vector store: {e}")
|
| 61 |
return []
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def get_collection_info(self) -> Dict:
|
|
|
|
| 64 |
try:
|
| 65 |
count = self.collection.count()
|
| 66 |
return {
|
|
|
|
| 67 |
'count': count,
|
| 68 |
+
'status': 'ready',
|
| 69 |
+
'persist_path': self.chroma_path
|
| 70 |
}
|
| 71 |
except Exception as e:
|
| 72 |
print(f"Error getting collection info: {e}")
|
| 73 |
+
return {
|
| 74 |
+
'count': 0,
|
| 75 |
+
'status': 'error',
|
| 76 |
+
'persist_path': self.chroma_path
|
| 77 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
def clear_all(self):
|
|
|
|
| 80 |
try:
|
| 81 |
+
self.client.delete_collection(name="documents")
|
| 82 |
self.collection = self.client.get_or_create_collection(
|
| 83 |
+
name="documents",
|
| 84 |
metadata={"hnsw:space": "cosine"}
|
| 85 |
)
|
| 86 |
+
print("Vector store cleared")
|
| 87 |
except Exception as e:
|
| 88 |
+
print(f"Error clearing vector store: {e}")
|