dnj0 commited on
Commit
34bfedc
·
1 Parent(s): 00cf41b
Files changed (5) hide show
  1. src/app.py +82 -107
  2. src/config.py +11 -7
  3. src/pdf_parser.py +24 -70
  4. src/rag_system.py +106 -239
  5. src/vector_store.py +49 -170
src/app.py CHANGED
@@ -1,33 +1,25 @@
1
- """
2
- UI RAG
3
- """
4
-
5
  import streamlit as st
6
  import os
7
  from pathlib import Path
8
-
9
  from pdf_parser import PDFParser
10
  from vector_store import VectorStore
11
- from rag_system import VisualMultimodalRAG
12
  from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
13
 
14
-
15
-
16
  st.set_page_config(
17
- page_title="Мультимодальная RAG система (PDF parsing)",
 
18
  layout="wide",
19
  initial_sidebar_state="expanded"
20
  )
21
 
22
-
23
-
24
  if 'api_key_set' not in st.session_state:
25
  st.session_state.api_key_set = False
26
 
27
  if 'api_key' not in st.session_state:
28
  st.session_state.api_key = None
29
 
30
- if 'visual_rag_system' not in st.session_state:
31
  st.session_state.visual_rag_system = None
32
 
33
  if 'vector_store' not in st.session_state:
@@ -54,22 +46,20 @@ if 'processing_results' not in st.session_state:
54
  if 'answering_rag' not in st.session_state:
55
  st.session_state.answering_rag = None
56
 
 
57
 
58
-
59
- st.title("Мультимодальная RAG система (PDF parsing)")
60
  st.markdown("""
61
- Обрабатывает PDF документы и предоставляет информацию по ним
62
- """)
63
-
64
 
 
 
65
 
66
  with st.sidebar:
67
- st.header(" Конфигурация")
68
 
69
- st.subheader(" OpenAI API Ключ")
70
 
71
  api_key = st.text_input(
72
- "Введите OpenAI API ключ:",
73
  type="password",
74
  key="api_key_input"
75
  )
@@ -80,70 +70,65 @@ with st.sidebar:
80
 
81
  if st.session_state.visual_rag_system is None:
82
  try:
83
- st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
84
  st.session_state.vector_store = VectorStore()
85
  st.session_state.parser = PDFParser(debug=True)
86
- st.success("API ключ введен")
87
  except Exception as e:
88
- st.error(f"Ошибка старта системы: {e}")
89
  else:
90
  st.session_state.api_key_set = False
91
- st.warning("Введите OpenAI API ключ")
92
 
93
  st.divider()
94
 
95
  st.subheader("Векторное хранилище")
 
96
  if st.session_state.vector_store:
97
  try:
98
  info = st.session_state.vector_store.get_collection_info()
99
- st.metric("Документов в хранилище", info['count'])
100
- st.caption(f"Расположение: {info['persist_path']}")
101
  except Exception as e:
102
- st.error(f"Ошибка получения информации: {e}")
103
  else:
104
- st.info("Введите OpenAI API ключ")
105
 
106
  st.divider()
107
 
108
- st.subheader("Управление хранилищем")
109
- if st.button("Очистить хранилище"):
 
110
  if st.session_state.vector_store:
111
  try:
112
  st.session_state.vector_store.clear_all()
113
- st.success("Хранилище очищено")
114
  except Exception as e:
115
- st.error(f"Ошибка очистки хранилища: {e}")
116
-
117
 
118
-
119
- st.header("Загрузить PDF")
120
 
121
  uploaded_file = st.file_uploader(
122
- "Выбрать...",
123
  type=['pdf'],
124
- help="Загрузите PDF файл"
125
  )
126
 
127
  if uploaded_file is not None:
128
  upload_path = Path(UPLOAD_FOLDER)
129
  upload_path.mkdir(exist_ok=True)
130
-
131
  file_path = upload_path / uploaded_file.name
132
  with open(file_path, 'wb') as f:
133
  f.write(uploaded_file.getbuffer())
134
-
135
- st.success(f"Файл загружен: {uploaded_file.name}")
136
 
137
  if st.button("Распарсить PDF"):
138
  if not st.session_state.api_key_set:
139
- st.error("Введите OpenAI API ключ")
140
  else:
141
  try:
142
- with st.spinner(" Парсинг PDF..."):
143
-
144
- print(f"Парсинг PDF файла: {uploaded_file.name}")
145
-
146
-
147
  parser = st.session_state.parser
148
  text, images, tables = parser.parse_pdf(str(file_path))
149
 
@@ -154,42 +139,45 @@ if uploaded_file is not None:
154
 
155
  col1, col2, col3 = st.columns(3)
156
  with col1:
157
- st.metric("Текста", f"{len(text):,} chars")
158
  with col2:
159
- st.metric("Изображений", len(images))
160
  with col3:
161
- st.metric("Таблиц", len(tables))
162
 
163
- st.success("Парсинг PDF завершен!")
 
 
 
 
 
 
 
 
164
 
 
165
  except Exception as e:
166
- st.error(f"Парсинг PDF завершелся с ошибкой: {e}")
167
- print(f"Ошибка: {e}")
168
-
169
-
170
 
171
  st.divider()
172
- st.header("Анализ документа")
173
 
174
- st.info("""
175
- Отправляет содержимое документа на анализ
176
- """)
177
 
178
- if st.button("Проанализировать документ"):
 
179
  if not st.session_state.api_key_set:
180
- st.error("Введите OpenAI API ключ")
181
  elif st.session_state.current_text is None:
182
- st.error("Распарсите документ")
183
  else:
184
  try:
185
- with st.spinner("Анализ с gpt-4o-mini..."):
186
-
187
  visual_rag = st.session_state.visual_rag_system
188
  vector_store = st.session_state.vector_store
189
-
190
  results = visual_rag.process_and_store_document(
191
  text=st.session_state.current_text,
192
- images=st.session_state.current_images,
193
  tables=st.session_state.current_tables,
194
  vector_store=vector_store,
195
  doc_id=st.session_state.current_document or "current_doc"
@@ -197,53 +185,45 @@ if st.button("Проанализировать документ"):
197
 
198
  st.session_state.processing_results = results
199
 
200
- st.success("Анализ готов!")
201
 
202
  col1, col2, col3 = st.columns(3)
203
  with col1:
204
  st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
205
  with col2:
206
- st.metric("Проанализировано чанков текста", len(results['text_summaries']))
207
  with col3:
208
  st.metric("Проанализировано та��лиц", len(results['table_summaries']))
209
 
210
- st.metric("Помещено в хранилище", results['total_stored'])
211
-
212
-
213
- print(f"Анализ завершен")
214
 
215
  except Exception as e:
216
- st.error(f"Ошибка в ходе: {e}")
217
- print(f"Ошибка: {e}")
218
-
219
-
220
 
221
  st.divider()
222
- st.header("Работа с документом")
223
 
224
- if 'answering_rag' not in st.session_state:
225
- st.session_state.answering_rag = None
226
 
227
  if st.session_state.api_key_set and st.session_state.answering_rag is None:
228
- from rag_system import AnsweringRAG
229
  st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
230
 
231
  question = st.text_area(
232
- "Введите запрос:",
233
  height=100,
234
- placeholder="О чем данный документ?"
235
  )
236
 
237
- if st.button("Генерация ответа"):
238
  if not st.session_state.api_key_set:
239
- st.error("Введите OpenAI API ключ")
240
  elif st.session_state.current_text is None:
241
- st.error("Распарсите документ")
242
  elif not question:
243
- st.error("Введите запрос")
244
  else:
245
  try:
246
- with st.spinner("Поиск документов..."):
 
247
  store = st.session_state.vector_store
248
 
249
  doc_name = st.session_state.current_document or "current_doc"
@@ -252,55 +232,50 @@ if st.button("Генерация ответа"):
252
  'images': [],
253
  'tables': []
254
  }
 
255
  store.add_documents(doc_data, doc_name)
256
 
257
  search_results = store.search(question, n_results=5)
258
 
259
- print(f"Найдено: {len(search_results)}")
260
-
261
  answering_rag = st.session_state.answering_rag
262
  result = answering_rag.analyze_and_answer(question, search_results)
263
 
264
- st.success("Поиск завершен!")
265
 
266
  st.subheader("Ответ")
267
 
268
  col1, col2, col3 = st.columns(3)
269
  with col1:
270
- confidence_color = {
271
- 'high': '🟢',
272
- 'medium': '🟡',
273
- 'low': '🔴'
274
- }.get(result['confidence'], '⚪')
275
- st.metric("Уверенность в ответе", f"{confidence_color} {result['confidence'].upper()}")
 
276
  with col2:
277
  st.metric("Использовано источников", result['sources_used'])
278
  with col3:
279
  if result['sources_used'] > 0:
280
- st.metric("Среднняя релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
281
 
282
  st.write(result['answer'])
283
 
284
  if st.checkbox("Показать исходные документы"):
285
- st.subheader("Использованы документы")
286
- for idx, source in enumerate(result['formatted_sources'], 1):
287
  relevance = source['relevance']
288
- relevance_bar = "|" * int(relevance * 10) + "|" * (10 - int(relevance * 10))
289
-
290
  with st.expander(
291
- f"Источник {idx} - {source['type'].upper()} "
292
  f"[{relevance_bar}] {relevance:.0%}"
293
  ):
294
  st.write(source['content'])
295
-
296
- print(f" Ответ готов!")
297
-
298
  except Exception as e:
299
- st.error(f"Ошибка обработки запроса: {e}")
300
- print(f"Ошибка: {e}")
301
 
302
  st.divider()
303
 
304
  st.caption(
305
- "Мультимодальная RAG система для парсинга PDF документов"
306
  )
 
 
 
 
 
1
  import streamlit as st
2
  import os
3
  from pathlib import Path
 
4
  from pdf_parser import PDFParser
5
  from vector_store import VectorStore
6
+ from rag_system import VisualMultimodalRAG, AnsweringRAG
7
  from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
8
 
 
 
9
  st.set_page_config(
10
+ page_title="Мультимодальная система RAG LLM",
11
+ page_icon="",
12
  layout="wide",
13
  initial_sidebar_state="expanded"
14
  )
15
 
 
 
16
  if 'api_key_set' not in st.session_state:
17
  st.session_state.api_key_set = False
18
 
19
  if 'api_key' not in st.session_state:
20
  st.session_state.api_key = None
21
 
22
+ if 'visual_rag_system' not in st.session_state:
23
  st.session_state.visual_rag_system = None
24
 
25
  if 'vector_store' not in st.session_state:
 
46
  if 'answering_rag' not in st.session_state:
47
  st.session_state.answering_rag = None
48
 
49
+ st.title("Мультимодальная система RAG LLM")
50
 
 
 
51
  st.markdown("""
 
 
 
52
 
53
+ Обработка PDF-документов с анализом визуального контента
54
+ """)
55
 
56
  with st.sidebar:
57
+ st.header("Конфигурация")
58
 
59
+ st.subheader("Ключ API OpenAI")
60
 
61
  api_key = st.text_input(
62
+ "Введите ваш ключ API OpenAI:",
63
  type="password",
64
  key="api_key_input"
65
  )
 
70
 
71
  if st.session_state.visual_rag_system is None:
72
  try:
73
+ st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True)
74
  st.session_state.vector_store = VectorStore()
75
  st.session_state.parser = PDFParser(debug=True)
76
+ st.success("Ключ API установлен")
77
  except Exception as e:
78
+ st.error(f"Ошибка при инициализации систем: {e}")
79
  else:
80
  st.session_state.api_key_set = False
81
+ st.warning("Введите ключ API для продолжения")
82
 
83
  st.divider()
84
 
85
  st.subheader("Векторное хранилище")
86
+
87
  if st.session_state.vector_store:
88
  try:
89
  info = st.session_state.vector_store.get_collection_info()
90
+ st.metric("Элементов в хранилище", info['count'])
91
+ st.caption(f"Путь: {info['persist_path']}")
92
  except Exception as e:
93
+ st.error(f"Ошибка получения информации о хранилище: {e}")
94
  else:
95
+ st.info("Установите ключ API для инициализации векторного хранилища")
96
 
97
  st.divider()
98
 
99
+ st.subheader("Управление документами")
100
+
101
+ if st.button("Очистить векторное хранилище"):
102
  if st.session_state.vector_store:
103
  try:
104
  st.session_state.vector_store.clear_all()
105
+ st.success("Векторное хранилище очищено")
106
  except Exception as e:
107
+ st.error(f"Ошибка при очистке хранилища: {e}")
 
108
 
109
+ st.header("Загрузка PDF-документа")
 
110
 
111
  uploaded_file = st.file_uploader(
112
+ "Выберите PDF-файл",
113
  type=['pdf'],
114
+ help="PDF с текстом, изображениями и таблицами"
115
  )
116
 
117
  if uploaded_file is not None:
118
  upload_path = Path(UPLOAD_FOLDER)
119
  upload_path.mkdir(exist_ok=True)
 
120
  file_path = upload_path / uploaded_file.name
121
  with open(file_path, 'wb') as f:
122
  f.write(uploaded_file.getbuffer())
123
+ st.success(f"Файл сохранён: {uploaded_file.name}")
 
124
 
125
  if st.button("Распарсить PDF"):
126
  if not st.session_state.api_key_set:
127
+ st.error("Введите ключ API для продолжения")
128
  else:
129
  try:
130
+ with st.spinner("Парсинг PDF..."):
131
+ print("PARSING: " + uploaded_file.name)
 
 
 
132
  parser = st.session_state.parser
133
  text, images, tables = parser.parse_pdf(str(file_path))
134
 
 
139
 
140
  col1, col2, col3 = st.columns(3)
141
  with col1:
142
+ st.metric("Текст", f"{len(text):,} символов")
143
  with col2:
144
+ st.metric("Изображения", len(images))
145
  with col3:
146
+ st.metric("Таблицы", len(tables))
147
 
148
+ # if images:
149
+ # st.subheader("Извлечённые изображения")
150
+ # for idx, img in enumerate(images):
151
+ # ocr_text = img.get('ocr_text', '')
152
+ # ocr_len = len(ocr_text)
153
+ # if ocr_len > 0:
154
+ # st.success(f"Изображение {idx}: {ocr_len} символов (OCR)")
155
+ # else:
156
+ # st.warning(f"Изображение {idx}: Текст OCR не найден (будет использоваться визуальный анализ)")
157
 
158
+ st.success("Парсинг PDF завершён!")
159
  except Exception as e:
160
+ st.error(f"Ошибка при парсинге PDF: {e}")
 
 
 
161
 
162
  st.divider()
 
163
 
164
+ st.header("Анализ")
 
 
165
 
166
+
167
+ if st.button("Анализировать"):
168
  if not st.session_state.api_key_set:
169
+ st.error("Введите ключ API для продолжения")
170
  elif st.session_state.current_text is None:
171
+ st.error("Распарсьте PDF-документ")
172
  else:
173
  try:
174
+ with st.spinner("Анализ изображений с помощью gpt-4o-mini..."):
175
+ print("ANALYSIS")
176
  visual_rag = st.session_state.visual_rag_system
177
  vector_store = st.session_state.vector_store
 
178
  results = visual_rag.process_and_store_document(
179
  text=st.session_state.current_text,
180
+ images=st.session_state.current_images,
181
  tables=st.session_state.current_tables,
182
  vector_store=vector_store,
183
  doc_id=st.session_state.current_document or "current_doc"
 
185
 
186
  st.session_state.processing_results = results
187
 
188
+ st.success("Анализ завершён и сохранён!")
189
 
190
  col1, col2, col3 = st.columns(3)
191
  with col1:
192
  st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
193
  with col2:
194
+ st.metric("Фрагментов текста", len(results['text_summaries']))
195
  with col3:
196
  st.metric("Проанализировано та��лиц", len(results['table_summaries']))
197
 
198
+ st.metric("Всего сохранено в вектор", results['total_stored'])
 
 
 
199
 
200
  except Exception as e:
201
+ st.error(f"Ошибка при анализе: {e}")
 
 
 
202
 
203
  st.divider()
 
204
 
205
+ st.header("Задать вопрос о документе")
 
206
 
207
  if st.session_state.api_key_set and st.session_state.answering_rag is None:
 
208
  st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
209
 
210
  question = st.text_area(
211
+ "Введите ваш вопрос:",
212
  height=100,
213
+ placeholder="О чем говорится в документе?"
214
  )
215
 
216
+ if st.button("Поиск и генерация ответа"):
217
  if not st.session_state.api_key_set:
218
+ st.error("Введите ключ API для продолжения")
219
  elif st.session_state.current_text is None:
220
+ st.error("Распарсьте PDF-документ")
221
  elif not question:
222
+ st.error("Введите вопрос")
223
  else:
224
  try:
225
+ with st.spinner("Поиск в документе и анализ..."):
226
+ print("QUESTION: " + question)
227
  store = st.session_state.vector_store
228
 
229
  doc_name = st.session_state.current_document or "current_doc"
 
232
  'images': [],
233
  'tables': []
234
  }
235
+
236
  store.add_documents(doc_data, doc_name)
237
 
238
  search_results = store.search(question, n_results=5)
239
 
 
 
240
  answering_rag = st.session_state.answering_rag
241
  result = answering_rag.analyze_and_answer(question, search_results)
242
 
243
+ st.success("Анализ завершён!")
244
 
245
  st.subheader("Ответ")
246
 
247
  col1, col2, col3 = st.columns(3)
248
  with col1:
249
+ confidence_map = {
250
+ 'high': 'ВЫСОКАЯ',
251
+ 'medium': 'СРЕДНЯЯ',
252
+ 'low': 'НИЗКАЯ'
253
+ }
254
+ confidence_text = confidence_map.get(result['confidence'], result['confidence'].upper())
255
+ st.metric("Уверенность", confidence_text)
256
  with col2:
257
  st.metric("Использовано источников", result['sources_used'])
258
  with col3:
259
  if result['sources_used'] > 0:
260
+ st.metric("Сред. релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
261
 
262
  st.write(result['answer'])
263
 
264
  if st.checkbox("Показать исходные документы"):
265
+ st.subheader("Источники, использованные в ответе")
266
+ for source in result.get('formatted_sources', []):
267
  relevance = source['relevance']
268
+ relevance_bar = "" * int(relevance * 10) + "" * (10 - int(relevance * 10))
 
269
  with st.expander(
270
+ f"Источник {source['index']} - {source['type'].upper()} "
271
  f"[{relevance_bar}] {relevance:.0%}"
272
  ):
273
  st.write(source['content'])
 
 
 
274
  except Exception as e:
275
+ st.error(f"Ошибка при обработке вопроса: {e}")
 
276
 
277
  st.divider()
278
 
279
  st.caption(
280
+ "Мультимодальная система RAG"
281
  )
src/config.py CHANGED
@@ -1,34 +1,38 @@
1
- """
2
- Конфигурационный файл
3
- """
4
  import os
5
  from pathlib import Path
6
 
7
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 
8
  OPENAI_MODEL = "gpt-4o-mini"
 
9
  USE_CACHE = True
10
 
11
  CHROMA_DB_PATH = "./chroma_db"
 
12
  DOCSTORE_PATH = "./docstore"
 
13
  PROCESSED_FILES_LOG = "./processed_files.txt"
14
 
15
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
 
16
  EMBEDDING_DIM = 768
17
 
18
  MAX_CHUNK_SIZE = 500
 
19
  CHUNK_OVERLAP = 50
 
20
  TEMPERATURE = 0.3
 
21
  MAX_TOKENS = 500
22
 
23
  LANGUAGE = "russian"
24
 
25
  Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
 
26
  Path(DOCSTORE_PATH).mkdir(exist_ok=True)
27
 
28
  UPLOAD_FOLDER = "./uploaded_pdfs"
 
29
  Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
30
- MAX_PDF_SIZE_MB = 50
31
 
32
- BATCH_SEARCH_RESULTS = 3
33
- CACHE_RESPONSES = True
34
- SUMMARIZE_FIRST = True
 
 
 
 
1
  import os
2
  from pathlib import Path
3
 
4
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
5
+
6
  OPENAI_MODEL = "gpt-4o-mini"
7
+
8
  USE_CACHE = True
9
 
10
  CHROMA_DB_PATH = "./chroma_db"
11
+
12
  DOCSTORE_PATH = "./docstore"
13
+
14
  PROCESSED_FILES_LOG = "./processed_files.txt"
15
 
16
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
17
+
18
  EMBEDDING_DIM = 768
19
 
20
  MAX_CHUNK_SIZE = 500
21
+
22
  CHUNK_OVERLAP = 50
23
+
24
  TEMPERATURE = 0.3
25
+
26
  MAX_TOKENS = 500
27
 
28
  LANGUAGE = "russian"
29
 
30
  Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
31
+
32
  Path(DOCSTORE_PATH).mkdir(exist_ok=True)
33
 
34
  UPLOAD_FOLDER = "./uploaded_pdfs"
35
+
36
  Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
 
37
 
38
+ MAX_PDF_SIZE_MB = 50
 
 
src/pdf_parser.py CHANGED
@@ -1,6 +1,3 @@
1
- """
2
- PDF Парсер
3
- """
4
  import os
5
  import json
6
  import hashlib
@@ -12,34 +9,37 @@ from PIL import Image
12
  import pytesseract
13
  from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
14
 
15
-
16
  class PDFParser:
17
  def __init__(self, debug: bool = True):
18
  self.docstore_path = Path(DOCSTORE_PATH)
19
  self.docstore_path.mkdir(exist_ok=True)
20
  self.processed_files = self._load_processed_files()
21
  self.debug = debug
22
-
23
-
24
  if self.debug:
25
  print("PDFParser initialized")
26
 
 
 
 
 
 
 
 
27
  def _debug_print(self, label: str, data: any):
28
- """Debug"""
29
  if self.debug:
30
- print(f"\n🔍 [PDF Parser] {label}")
31
  if isinstance(data, dict):
32
  for key, val in data.items():
33
- print(f" {key}: {val}")
34
  elif isinstance(data, (list, tuple)):
35
- print(f" Count: {len(data)}")
36
  for i, item in enumerate(data[:3]):
37
- print(f" [{i}]: {str(item)[:100]}")
38
  else:
39
- print(f" {data}")
40
 
41
  def _load_processed_files(self) -> Dict[str, str]:
42
- """Подгрузка обработанных файлов"""
43
  if os.path.exists(PROCESSED_FILES_LOG):
44
  try:
45
  with open(PROCESSED_FILES_LOG, 'r') as f:
@@ -49,12 +49,10 @@ class PDFParser:
49
  return {}
50
 
51
  def _save_processed_files(self):
52
- """Сохранение обработанных файлов"""
53
  with open(PROCESSED_FILES_LOG, 'w') as f:
54
  json.dump(self.processed_files, f, indent=2)
55
 
56
  def _get_file_hash(self, file_path: str) -> str:
57
- """Проверка изменения файлов"""
58
  hash_md5 = hashlib.md5()
59
  with open(file_path, "rb") as f:
60
  for chunk in iter(lambda: f.read(4096), b""):
@@ -62,56 +60,43 @@ class PDFParser:
62
  return hash_md5.hexdigest()
63
 
64
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
65
- """Извлечение текста из PDF"""
66
  text = ""
67
  try:
68
  with open(pdf_path, 'rb') as file:
69
  reader = PyPDF2.PdfReader(file)
70
  page_count = len(reader.pages)
71
  self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
72
-
73
  for page_num, page in enumerate(reader.pages):
74
  page_text = page.extract_text()
75
  text += page_text + "\n"
76
  self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
77
  except Exception as e:
78
  self._debug_print("ERROR extracting text", str(e))
79
-
80
  self._debug_print("Total Text Extracted", len(text))
81
  return text
82
 
83
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
84
- """Извлечение изображений из PDF"""
85
  images_data = []
86
  try:
87
- self._debug_print("Image extraction", f"File: {pdf_path}")
88
-
89
  images = convert_from_path(pdf_path, dpi=150)
90
- self._debug_print(f"Total images: {len(images)}")
91
-
92
  for idx, image in enumerate(images):
93
- self._debug_print(f"Image {idx}", f"Size: {image.size}")
94
-
95
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
96
  image.save(image_path)
97
  self._debug_print(f"Image {idx} Saved", str(image_path))
98
-
99
- self._debug_print(f"Image {idx} OCR", "Running OCR...")
100
-
101
  try:
102
  ocr_text = pytesseract.image_to_string(image, lang='rus')
103
-
104
  ocr_text = ocr_text.strip()
105
-
106
  if not ocr_text or len(ocr_text) < 5:
107
- self._debug_print(f"Image {idx} OCR Result", f"WARN ({len(ocr_text)} chars)")
108
  else:
109
- self._debug_print(f"Image {idx} OCR Result", f"SUCCESS {len(ocr_text)} chars: {ocr_text[:150]}")
110
-
111
  except Exception as ocr_error:
112
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
113
- ocr_text = f"[Image {idx}: OCR failed {str(ocr_error)}]"
114
-
115
  images_data.append({
116
  'page': idx,
117
  'path': str(image_path),
@@ -120,19 +105,15 @@ class PDFParser:
120
  })
121
  except Exception as e:
122
  self._debug_print("ERROR extracting images", str(e))
123
-
124
  self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
125
  return images_data
126
 
127
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
128
- """Извлечение таблиц из PDF"""
129
  tables_data = []
130
  try:
131
  text = self._extract_text_from_pdf(pdf_path)
132
  lines = text.split('\n')
133
-
134
- self._debug_print("Table extraction", f"Scanning {len(lines)} lines")
135
-
136
  current_table = []
137
  for line in lines:
138
  if '|' in line or '\t' in line:
@@ -144,53 +125,40 @@ class PDFParser:
144
  'description': f"Table {len(tables_data) + 1}"
145
  })
146
  current_table = []
147
-
148
  if current_table and len(current_table) > 1:
149
  tables_data.append({
150
  'content': '\n'.join(current_table),
151
  'description': f"Table {len(tables_data) + 1}"
152
  })
153
-
154
  self._debug_print("Tables Found", len(tables_data))
155
  except Exception as e:
156
  self._debug_print("ERROR extracting tables", str(e))
157
-
158
  return tables_data
159
 
160
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
161
- """Парсинг PDF"""
162
  file_hash = self._get_file_hash(pdf_path)
163
  doc_id = Path(pdf_path).stem
164
-
165
  self._debug_print("PDF Parsing Started", f"File: {doc_id}")
166
-
167
  if doc_id in self.processed_files:
168
  if self.processed_files[doc_id] == file_hash:
169
  self._debug_print("Status", f"File {doc_id} already processed")
170
  return self._load_extracted_data(doc_id)
171
-
172
- print(f"\nProcessing PDF: {doc_id}")
173
-
174
  text = self._extract_text_from_pdf(pdf_path)
175
  images = self._extract_images_from_pdf(pdf_path, doc_id)
176
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
177
-
178
- self._debug_print("Summary", {
179
  'text_length': len(text),
180
  'images_count': len(images),
181
  'tables_count': len(tables),
182
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
183
  })
184
-
185
  self._save_extracted_data(doc_id, text, images, tables)
186
-
187
  self.processed_files[doc_id] = file_hash
188
  self._save_processed_files()
189
-
190
  return text, images, tables
191
 
192
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
193
- """Сохранение извелеченных данных в Docstore"""
194
  data = {
195
  'text': text,
196
  'images': images,
@@ -199,27 +167,13 @@ class PDFParser:
199
  data_path = self.docstore_path / f"{doc_id}_data.json"
200
  with open(data_path, 'w', encoding='utf-8') as f:
201
  json.dump(data, f, ensure_ascii=False, indent=2)
202
-
203
  self._debug_print("Data Saved", str(data_path))
204
 
205
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
206
- """Подгрузка ранее извлеченных данных из Docstore"""
207
  data_path = self.docstore_path / f"{doc_id}_data.json"
208
  try:
209
  with open(data_path, 'r', encoding='utf-8') as f:
210
  data = json.load(f)
211
  return data['text'], data['images'], data['tables']
212
  except:
213
- return "", [], []
214
-
215
- def get_all_documents(self) -> Dict:
216
- """Получение всех документов из Docstore"""
217
- all_docs = {}
218
- for json_file in self.docstore_path.glob("*_data.json"):
219
- doc_id = json_file.stem.replace("_data", "")
220
- try:
221
- with open(json_file, 'r', encoding='utf-8') as f:
222
- all_docs[doc_id] = json.load(f)
223
- except:
224
- pass
225
- return all_docs
 
 
 
 
1
  import os
2
  import json
3
  import hashlib
 
9
  import pytesseract
10
  from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
11
 
 
12
  class PDFParser:
13
  def __init__(self, debug: bool = True):
14
  self.docstore_path = Path(DOCSTORE_PATH)
15
  self.docstore_path.mkdir(exist_ok=True)
16
  self.processed_files = self._load_processed_files()
17
  self.debug = debug
18
+ self._configure_tesseract()
 
19
  if self.debug:
20
  print("PDFParser initialized")
21
 
22
+ def _configure_tesseract(self):
23
+ try:
24
+ pytesseract.get_tesseract_version()
25
+ print("Tesseract configured successfully")
26
+ except Exception as e:
27
+ print(f"Tesseract configuration warning: {e}")
28
+
29
  def _debug_print(self, label: str, data: any):
 
30
  if self.debug:
31
+ print(f"[PDF Parser] {label}")
32
  if isinstance(data, dict):
33
  for key, val in data.items():
34
+ print(f" {key}: {val}")
35
  elif isinstance(data, (list, tuple)):
36
+ print(f" Count: {len(data)}")
37
  for i, item in enumerate(data[:3]):
38
+ print(f" [{i}]: {str(item)[:100]}")
39
  else:
40
+ print(f" {data}")
41
 
42
  def _load_processed_files(self) -> Dict[str, str]:
 
43
  if os.path.exists(PROCESSED_FILES_LOG):
44
  try:
45
  with open(PROCESSED_FILES_LOG, 'r') as f:
 
49
  return {}
50
 
51
  def _save_processed_files(self):
 
52
  with open(PROCESSED_FILES_LOG, 'w') as f:
53
  json.dump(self.processed_files, f, indent=2)
54
 
55
  def _get_file_hash(self, file_path: str) -> str:
 
56
  hash_md5 = hashlib.md5()
57
  with open(file_path, "rb") as f:
58
  for chunk in iter(lambda: f.read(4096), b""):
 
60
  return hash_md5.hexdigest()
61
 
62
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
 
63
  text = ""
64
  try:
65
  with open(pdf_path, 'rb') as file:
66
  reader = PyPDF2.PdfReader(file)
67
  page_count = len(reader.pages)
68
  self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
 
69
  for page_num, page in enumerate(reader.pages):
70
  page_text = page.extract_text()
71
  text += page_text + "\n"
72
  self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
73
  except Exception as e:
74
  self._debug_print("ERROR extracting text", str(e))
 
75
  self._debug_print("Total Text Extracted", len(text))
76
  return text
77
 
78
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
 
79
  images_data = []
80
  try:
81
+ self._debug_print("Image Extraction Started", f"File: {pdf_path}")
 
82
  images = convert_from_path(pdf_path, dpi=150)
83
+ self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
 
84
  for idx, image in enumerate(images):
85
+ self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
 
86
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
87
  image.save(image_path)
88
  self._debug_print(f"Image {idx} Saved", str(image_path))
89
+ self._debug_print(f"Image {idx} OCR")
 
 
90
  try:
91
  ocr_text = pytesseract.image_to_string(image, lang='rus')
 
92
  ocr_text = ocr_text.strip()
 
93
  if not ocr_text or len(ocr_text) < 5:
94
+ self._debug_print(f"Image {idx} OCR Result", f"EMPTY ({len(ocr_text)} chars)")
95
  else:
96
+ self._debug_print(f"Image {idx} OCR Result", f"Success - {len(ocr_text)} chars: {ocr_text[:150]}")
 
97
  except Exception as ocr_error:
98
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
99
+ ocr_text = f"Image {idx}: OCR failed - {str(ocr_error)}"
 
100
  images_data.append({
101
  'page': idx,
102
  'path': str(image_path),
 
105
  })
106
  except Exception as e:
107
  self._debug_print("ERROR extracting images", str(e))
 
108
  self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
109
  return images_data
110
 
111
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
 
112
  tables_data = []
113
  try:
114
  text = self._extract_text_from_pdf(pdf_path)
115
  lines = text.split('\n')
116
+ self._debug_print("Table Detection", f"Scanning {len(lines)} lines")
 
 
117
  current_table = []
118
  for line in lines:
119
  if '|' in line or '\t' in line:
 
125
  'description': f"Table {len(tables_data) + 1}"
126
  })
127
  current_table = []
 
128
  if current_table and len(current_table) > 1:
129
  tables_data.append({
130
  'content': '\n'.join(current_table),
131
  'description': f"Table {len(tables_data) + 1}"
132
  })
 
133
  self._debug_print("Tables Found", len(tables_data))
134
  except Exception as e:
135
  self._debug_print("ERROR extracting tables", str(e))
 
136
  return tables_data
137
 
138
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
 
139
  file_hash = self._get_file_hash(pdf_path)
140
  doc_id = Path(pdf_path).stem
 
141
  self._debug_print("PDF Parsing Started", f"File: {doc_id}")
 
142
  if doc_id in self.processed_files:
143
  if self.processed_files[doc_id] == file_hash:
144
  self._debug_print("Status", f"File {doc_id} already processed")
145
  return self._load_extracted_data(doc_id)
146
+ print(f"Processing PDF: {doc_id}")
 
 
147
  text = self._extract_text_from_pdf(pdf_path)
148
  images = self._extract_images_from_pdf(pdf_path, doc_id)
149
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
150
+ self._debug_print("Extraction Summary", {
 
151
  'text_length': len(text),
152
  'images_count': len(images),
153
  'tables_count': len(tables),
154
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
155
  })
 
156
  self._save_extracted_data(doc_id, text, images, tables)
 
157
  self.processed_files[doc_id] = file_hash
158
  self._save_processed_files()
 
159
  return text, images, tables
160
 
161
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
 
162
  data = {
163
  'text': text,
164
  'images': images,
 
167
  data_path = self.docstore_path / f"{doc_id}_data.json"
168
  with open(data_path, 'w', encoding='utf-8') as f:
169
  json.dump(data, f, ensure_ascii=False, indent=2)
 
170
  self._debug_print("Data Saved", str(data_path))
171
 
172
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
 
173
  data_path = self.docstore_path / f"{doc_id}_data.json"
174
  try:
175
  with open(data_path, 'r', encoding='utf-8') as f:
176
  data = json.load(f)
177
  return data['text'], data['images'], data['tables']
178
  except:
179
+ return "", [], []
 
 
 
 
 
 
 
 
 
 
 
 
src/rag_system.py CHANGED
@@ -1,55 +1,38 @@
1
- """
2
- RAG основной pipeline
3
- """
4
  from typing import List, Dict
5
  from langchain_openai import ChatOpenAI
6
- from langchain_core.messages import HumanMessage, SystemMessage
7
  import base64
8
  import os
9
  from pathlib import Path
10
  from config import (
11
- OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
12
  LANGUAGE, CHROMA_DB_PATH
13
  )
14
 
15
-
16
  class VisualMultimodalRAG:
17
- """
18
- RAG - подготовительный этап:
19
- 1. Кодирует изображение в base64 и отправляет в gpt-4o-mini
20
- 2. Получает описание изображения
21
- 3. Сохраняет описание в векторное хранилище
22
- """
23
-
24
  def __init__(self, api_key: str = None, debug: bool = True):
25
  api_key = api_key or OPENAI_API_KEY
26
  self.debug = debug
27
-
28
  self.llm = ChatOpenAI(
29
- model_name=OPENAI_MODEL,
30
  api_key=api_key,
31
  temperature=TEMPERATURE,
32
  max_tokens=MAX_TOKENS,
33
  )
34
-
35
  self.language = LANGUAGE
36
- self.visual_summaries_log = []
37
-
38
  if self.debug:
39
- print(f"VisualMultimodalRAG with {OPENAI_MODEL}")
40
 
41
  def _debug_print(self, label: str, data: any):
42
- """Debug"""
43
  if self.debug:
44
- print(f"\nDEBUG [{label}]:")
45
  if isinstance(data, (list, dict)):
46
- print(f" Type: {type(data).__name__}")
47
- print(f" Content: {str(data)[:300]}...")
48
  else:
49
- print(f" {data}")
50
 
51
  def _image_to_base64(self, image_path: str) -> str:
52
- """Конвертирует изображение в base64"""
53
  try:
54
  with open(image_path, 'rb') as image_file:
55
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
@@ -59,17 +42,12 @@ class VisualMultimodalRAG:
59
  return None
60
 
61
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
62
- """
63
- Отправляет в модель изображение для суммаризации
64
- """
65
  if not os.path.exists(image_path):
66
- return f"[Image {image_idx}: File not found - {image_path}]"
67
-
68
  try:
69
  image_base64 = self._image_to_base64(image_path)
70
  if not image_base64:
71
- return f"[Image {image_idx}: Error converting to base64]"
72
-
73
  file_ext = Path(image_path).suffix.lower()
74
  media_type_map = {
75
  '.jpg': 'image/jpeg',
@@ -79,9 +57,7 @@ class VisualMultimodalRAG:
79
  '.webp': 'image/webp'
80
  }
81
  media_type = media_type_map.get(file_ext, 'image/png')
82
-
83
- print(f" Analyzing image {image_idx}...")
84
-
85
  message = HumanMessage(
86
  content=[
87
  {
@@ -92,86 +68,62 @@ class VisualMultimodalRAG:
92
  },
93
  {
94
  "type": "text",
95
- "text": f"""Ты - ассистент по сбору и обобщению информации. Проанализируй изображение.
96
-
97
- По результатам анализа предоставь информацию:
98
- 1. Что изображено на картинке - основные объекты и элементы
99
- 2. Тип данных и содержимое - числа, графики, зависимости.
100
- 3. Назначение изображения - для чего оно представлено и что отображает
101
- 4. Связь с текстом
102
-
103
- Будь краток и содержателен. Фокусируйся на визуальной информации.
104
-
105
- Результат:"""
106
  }
107
  ],
108
  )
109
-
110
  response = self.llm.invoke([message])
111
  analysis = response.content.strip()
112
-
113
  if self.debug:
114
  self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
115
-
116
- print(f" Image {image_idx} analyzed successfully")
117
  return analysis
118
-
119
  except Exception as e:
120
- error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
121
- print(f" Error analyzing image {image_idx}: {e}")
122
  return error_msg
123
 
124
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
125
- """
126
- Считывает изображения и отправляет на анализ
127
- """
128
  visual_analyses = []
129
-
130
  for idx, image in enumerate(images):
131
  image_path = image.get('path', '')
132
-
133
  if not image_path:
134
- print(f" Image {idx}: No path")
135
  continue
136
-
137
  visual_analysis = self.analyze_image_visually(image_path, idx)
138
-
139
  visual_analyses.append({
140
  'type': 'image_visual',
141
  'image_index': idx,
142
  'image_path': image_path,
143
  'visual_analysis': visual_analysis,
144
- 'ocr_text': image.get('ocr_text', '')
145
  })
146
-
147
  return visual_analyses
148
 
149
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
150
- """
151
- Отправляет куски текста на суммаризацию
152
- """
153
  chunks = []
154
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
155
-
156
  self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")
157
-
158
  for idx, chunk in enumerate(text_chunks):
159
  if len(chunk.strip()) < 50:
160
  continue
161
-
162
  try:
163
- prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующий кусок текста.
164
- Выдели основные моменты, факты и идеи. Будь краток.
165
-
166
- Текст :
167
  {chunk}
168
-
169
- Результат:"""
170
-
171
  message = HumanMessage(content=prompt)
172
  response = self.llm.invoke([message])
173
  summary = response.content.strip()
174
-
175
  chunks.append({
176
  'type': 'text_chunk',
177
  'chunk_index': len(chunks),
@@ -179,40 +131,27 @@ class VisualMultimodalRAG:
179
  'summary': summary,
180
  'chunk_length': len(chunk)
181
  })
182
-
183
  if self.debug:
184
  self._debug_print(f"Text Chunk {len(chunks)-1} Summary", summary)
185
-
186
  except Exception as e:
187
  print(f"Error summarizing text chunk: {e}")
188
-
189
  return chunks
190
 
191
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
192
- """
193
- Отправляет таблицы на суммаризацию
194
- """
195
  summaries = []
196
-
197
  for idx, table in enumerate(tables):
198
  table_content = table.get('content', '')
199
-
200
  if not table_content or len(table_content.strip()) < 10:
201
  continue
202
-
203
  try:
204
- prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующию таблицу.
205
- Выдели основные моменты, числа, и значения строк/колонок. Будь краток.
206
-
207
- Таблица:
208
  {table_content}
209
-
210
- Результат:"""
211
-
212
  message = HumanMessage(content=prompt)
213
  response = self.llm.invoke([message])
214
  summary = response.content.strip()
215
-
216
  summaries.append({
217
  'type': 'table',
218
  'table_index': idx,
@@ -220,29 +159,21 @@ class VisualMultimodalRAG:
220
  'summary': summary,
221
  'table_length': len(table_content)
222
  })
223
-
224
  if self.debug:
225
  self._debug_print(f"Table {idx} Summary", summary)
226
-
227
  except Exception as e:
228
  print(f"Error summarizing table {idx}: {e}")
229
-
230
  return summaries
231
 
232
  def process_and_store_document(
233
- self,
234
- text: str,
235
  images: List[Dict],
236
  tables: List[Dict],
237
  vector_store,
238
  doc_id: str
239
  ) -> Dict:
240
- """
241
- Основной pipeline анализирует и сохраняет документы в хранилище
242
- """
243
-
244
- print(f"PROCESSING ANALYSIS: {doc_id}")
245
-
246
  results = {
247
  'doc_id': doc_id,
248
  'image_visual_analyses': [],
@@ -250,53 +181,42 @@ class VisualMultimodalRAG:
250
  'table_summaries': [],
251
  'total_stored': 0
252
  }
253
-
254
- print(f"\n VISUAL IMAGE ANALYSIS ({len(images)} )")
255
-
256
-
257
  image_analyses = self.analyze_images_visually(images)
258
  results['image_visual_analyses'] = image_analyses
259
-
260
  image_docs = {
261
  'text': ' | '.join([
262
- f"Image {a['image_index']}: {a['visual_analysis']}"
263
  for a in image_analyses
264
  ]),
265
  'images': [],
266
  'tables': []
267
  }
268
-
269
  for analysis in image_analyses:
270
- print(f" Image {analysis['image_index']}")
271
- print(f" Path: {analysis['image_path']}")
272
- print(f" Analysis: {analysis['visual_analysis'][:100]}...")
273
-
274
  if image_analyses:
275
  try:
276
  vector_store.add_documents(
277
- image_docs,
278
  f"{doc_id}_images_visual"
279
  )
280
  results['total_stored'] += len(image_analyses)
281
- print(f" Stored {len(image_analyses)} imagу analyses")
282
  except Exception as e:
283
  print(f"Error storing image analyses: {e}")
284
-
285
- print(f"\n TEXT CHUNK SUMMARIZATION")
286
-
287
  text_summaries = self.summarize_text_chunks(text)
288
  results['text_summaries'] = text_summaries
289
-
290
  text_docs = {
291
- 'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
292
- for s in text_summaries]),
293
  'images': [],
294
  'tables': []
295
  }
296
-
297
  for summary in text_summaries:
298
- print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
299
-
300
  if text_summaries:
301
  try:
302
  vector_store.add_documents(
@@ -304,25 +224,20 @@ class VisualMultimodalRAG:
304
  f"{doc_id}_text_chunks"
305
  )
306
  results['total_stored'] += len(text_summaries)
307
- print(f" Stored {len(text_summaries)} text chunk summaries")
308
  except Exception as e:
309
- print(f" Error text summaries: {e}")
310
-
311
- print(f"\n TABLE SUMMARIZATION ({len(tables)}")
312
-
313
  table_summaries = self.summarize_tables(tables)
314
  results['table_summaries'] = table_summaries
315
-
316
  table_docs = {
317
- 'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
318
- for s in table_summaries]),
319
  'images': [],
320
  'tables': []
321
  }
322
-
323
  for summary in table_summaries:
324
- print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
325
-
326
  if table_summaries:
327
  try:
328
  vector_store.add_documents(
@@ -330,17 +245,14 @@ class VisualMultimodalRAG:
330
  f"{doc_id}_tables"
331
  )
332
  results['total_stored'] += len(table_summaries)
333
- print(f" Stored {len(table_summaries)} table summaries")
334
  except Exception as e:
335
- print(f" Error storing table summaries: {e}")
336
-
337
- print(f" STORAGE SUMMARY")
338
- print(f" Images analyzed: {len(image_analyses)}")
339
- print(f" Text chunks summarized: {len(text_summaries)}")
340
- print(f" Tables summarized: {len(table_summaries)}")
341
- print(f" Total items stored in vector: {results['total_stored']}")
342
-
343
- self.visual_summaries_log.append(results)
344
  return results
345
 
346
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
@@ -352,73 +264,41 @@ class VisualMultimodalRAG:
352
  start = end - overlap
353
  return chunks
354
 
355
- def get_visual_summaries_log(self) -> List[Dict]:
356
- return self.visual_summaries_log
357
-
358
-
359
  class AnsweringRAG:
360
- """
361
- RAG - работа с ответом на запрос:
362
- 1. Поиск в векторном хранилище
363
- 2. Анализ результатов
364
- 3. Предоставление ответа
365
- """
366
-
367
  def __init__(self, api_key: str = None, debug: bool = True):
368
  api_key = api_key or OPENAI_API_KEY
369
  self.debug = debug
370
-
371
  self.llm = ChatOpenAI(
372
- model_name=OPENAI_MODEL,
373
  api_key=api_key,
374
  temperature=TEMPERATURE,
375
  max_tokens=MAX_TOKENS,
376
  )
377
-
378
  self.language = LANGUAGE
379
- self.answer_log = []
380
-
381
  if self.debug:
382
- print(" AnsweringRAG initialized")
383
 
384
  def _debug_print(self, label: str, data: any):
385
- """Debug"""
386
  if self.debug:
387
- print(f"\n🔍 DEBUG [{label}]:")
388
  if isinstance(data, (list, dict)):
389
- print(f" Type: {type(data).__name__}")
390
- print(f" Content: {str(data)[:300]}...")
391
  else:
392
- print(f" {data}")
393
 
394
  def analyze_and_answer(
395
- self,
396
- question: str,
397
  search_results: List[Dict]
398
  ) -> Dict:
399
- """
400
- Проанализируй найденные документов и на основе их предоставь ответ на вопрос пользователя
401
-
402
- Ответ:
403
- {
404
- 'question': user question,
405
- 'answer': detailed answer,
406
- 'sources_used': number of sources,
407
- 'confidence': low/medium/high,
408
- 'search_results': original search results
409
- }
410
- """
411
-
412
- print(f"ANALYZING QUESTION & GENERATING ANSWER")
413
-
414
- print(f"\n Question: {question}")
415
- print(f" Search Results: {len(search_results)}")
416
-
417
  if not search_results:
418
- print(f" No search results found!")
419
- answer = f"""Релевантная информация в документах отсутствует: "{question}"
420
  """
421
-
422
  result = {
423
  'question': question,
424
  'answer': answer,
@@ -426,72 +306,54 @@ class AnsweringRAG:
426
  'confidence': 'low',
427
  'search_results': []
428
  }
429
- self.answer_log.append(result)
430
  return result
431
-
432
  context_parts = []
433
  for idx, result in enumerate(search_results, 1):
434
  content = result.get('content', '')
435
- metadata = result.get('metadata', {})
436
  content_type = result.get('type', 'unknown')
437
  distance = result.get('distance', 0)
438
  relevance = 1 - distance if distance else 0
439
-
440
  context_parts.append(f"""
441
  [Source {idx} - {content_type.upper()} (relevance: {relevance:.1%})]
442
  {content}""")
443
-
444
  full_context = "\n".join(context_parts)
445
-
446
- self._debug_print("Context Prepared", f"{len(context_parts)} sources")
447
-
448
- analysis_prompt = f"""Ты - ассистент по анализу документов и ответов на вопросы по ним.
449
-
450
- ВОПРОС:
451
  "{question}"
452
-
453
- РЕЛЕВАНТНАЯ ИНФОРМАЦИЯ:
454
  {full_context}
455
-
456
- ИНСТРУКЦИИ:
457
- 1. Проанализируй предоставленный контент
458
- 2. Выдели информацию имеющую отношение к вопросу
459
- 3. Предоставь понятный и исчерпывающий ответ
460
- 4. Если контент полностью не отвечает на вопрос предосавь информацию которая доступна в контенте
461
- 5. Построй свой ответ опираясь на ключевые моменты
462
-
463
- Ответ:"""
464
-
465
- print(f"\n Analyzing search results...")
466
- print(f" Context size: {len(full_context)} chars")
467
- print(f" Sources: {len(search_results)}")
468
-
469
  try:
470
  message = HumanMessage(content=analysis_prompt)
471
  response = self.llm.invoke([message])
472
  answer = response.content.strip()
473
-
474
  confidence = self._estimate_confidence(len(search_results), answer)
475
-
476
- print(f" Answer generated successfully")
477
- print(f" Confidence: {confidence}")
478
- print(f" Answer length: {len(answer)} chars")
479
-
480
  result = {
481
  'question': question,
482
  'answer': answer,
483
  'sources_used': len(search_results),
484
  'confidence': confidence,
485
- 'search_results': search_results
 
486
  }
487
-
488
- self.answer_log.append(result)
489
  return result
490
-
491
  except Exception as e:
492
- print(f" Error generating answer: {e}")
493
- answer = f"Error while analyzing the search results."
494
-
495
  result = {
496
  'question': question,
497
  'answer': answer,
@@ -500,19 +362,24 @@ class AnsweringRAG:
500
  'error': str(e),
501
  'search_results': search_results
502
  }
503
-
504
- self.answer_log.append(result)
505
  return result
506
 
507
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
508
- """Уверенность в ответе на основании найденных источников информации"""
509
  answer_length = len(answer)
510
-
511
  if sources_count >= 3 and answer_length > 500:
512
  return "high"
513
-
514
  elif sources_count >= 2 and answer_length > 200:
515
  return "medium"
516
-
517
  else:
518
  return "low"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from typing import List, Dict
2
  from langchain_openai import ChatOpenAI
3
+ from langchain_core.messages import HumanMessage
4
  import base64
5
  import os
6
  from pathlib import Path
7
  from config import (
8
+ OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
9
  LANGUAGE, CHROMA_DB_PATH
10
  )
11
 
 
12
  class VisualMultimodalRAG:
 
 
 
 
 
 
 
13
  def __init__(self, api_key: str = None, debug: bool = True):
14
  api_key = api_key or OPENAI_API_KEY
15
  self.debug = debug
 
16
  self.llm = ChatOpenAI(
17
+ model_name="gpt-4o-mini",
18
  api_key=api_key,
19
  temperature=TEMPERATURE,
20
  max_tokens=MAX_TOKENS,
21
  )
 
22
  self.language = LANGUAGE
 
 
23
  if self.debug:
24
+ print("VisualMultimodalRAG initialized")
25
 
26
  def _debug_print(self, label: str, data: any):
 
27
  if self.debug:
28
+ print(f"DEBUG [{label}]:")
29
  if isinstance(data, (list, dict)):
30
+ print(f" Type: {type(data).__name__}")
31
+ print(f" Content: {str(data)[:300]}...")
32
  else:
33
+ print(f" {data}")
34
 
35
  def _image_to_base64(self, image_path: str) -> str:
 
36
  try:
37
  with open(image_path, 'rb') as image_file:
38
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
 
42
  return None
43
 
44
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
 
 
 
45
  if not os.path.exists(image_path):
46
+ return f"Image {image_idx}: File not found - {image_path}"
 
47
  try:
48
  image_base64 = self._image_to_base64(image_path)
49
  if not image_base64:
50
+ return f"Image {image_idx}: Could not convert to base64"
 
51
  file_ext = Path(image_path).suffix.lower()
52
  media_type_map = {
53
  '.jpg': 'image/jpeg',
 
57
  '.webp': 'image/webp'
58
  }
59
  media_type = media_type_map.get(file_ext, 'image/png')
60
+ print(f"Analyzing image {image_idx} visually (as {media_type})...")
 
 
61
  message = HumanMessage(
62
  content=[
63
  {
 
68
  },
69
  {
70
  "type": "text",
71
+ "text": f"""You are assistant for analyzing and aggregating information. Analyze this image.
72
+ Provide a visual analysis that includes:
73
+ 1. Main objects and element
74
+ 2. Data/Content - Any numbers, text, charts, graphs
75
+ 3. What this image is showing or representing
76
+ 4. Important patterns, trends, or information
77
+ 5. How image relates to document content
78
+ Be brief and meaningful. Focus on visual information that cannot be extracted from text. Response on {self.language}.
79
+ Analysis:"""
 
 
80
  }
81
  ],
82
  )
 
83
  response = self.llm.invoke([message])
84
  analysis = response.content.strip()
 
85
  if self.debug:
86
  self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
87
+ print(f"Image {image_idx} analyzed successfully")
 
88
  return analysis
 
89
  except Exception as e:
90
+ error_msg = f"Image {image_idx}: Vision analysis failed - {str(e)}"
91
+ print(f"Error analyzing image {image_idx}: {e}")
92
  return error_msg
93
 
94
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
 
 
 
95
  visual_analyses = []
 
96
  for idx, image in enumerate(images):
97
  image_path = image.get('path', '')
 
98
  if not image_path:
99
+ print(f"Image {idx}: No path provided")
100
  continue
 
101
  visual_analysis = self.analyze_image_visually(image_path, idx)
 
102
  visual_analyses.append({
103
  'type': 'image_visual',
104
  'image_index': idx,
105
  'image_path': image_path,
106
  'visual_analysis': visual_analysis,
107
+ 'ocr_text': image.get('ocr_text', '')
108
  })
 
109
  return visual_analyses
110
 
111
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
 
 
 
112
  chunks = []
113
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
 
114
  self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")
 
115
  for idx, chunk in enumerate(text_chunks):
116
  if len(chunk.strip()) < 50:
117
  continue
 
118
  try:
119
+ prompt = f"""Summarize this text chunk in {self.language}.
120
+ Be brief and meaningful. Extract key points, facts, and main ideas.
121
+ Text Chunk:
 
122
  {chunk}
123
+ Summary:"""
 
 
124
  message = HumanMessage(content=prompt)
125
  response = self.llm.invoke([message])
126
  summary = response.content.strip()
 
127
  chunks.append({
128
  'type': 'text_chunk',
129
  'chunk_index': len(chunks),
 
131
  'summary': summary,
132
  'chunk_length': len(chunk)
133
  })
 
134
  if self.debug:
135
  self._debug_print(f"Text Chunk {len(chunks)-1} Summary", summary)
 
136
  except Exception as e:
137
  print(f"Error summarizing text chunk: {e}")
 
138
  return chunks
139
 
140
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
 
 
 
141
  summaries = []
 
142
  for idx, table in enumerate(tables):
143
  table_content = table.get('content', '')
 
144
  if not table_content or len(table_content.strip()) < 10:
145
  continue
 
146
  try:
147
+ prompt = f"""Analyze and summarize this table/structured data in {self.language}.
148
+ Extract key insights, row/column meanings, and important figures. Be brief and meaningful.
149
+ Table Content:
 
150
  {table_content}
151
+ Summary:"""
 
 
152
  message = HumanMessage(content=prompt)
153
  response = self.llm.invoke([message])
154
  summary = response.content.strip()
 
155
  summaries.append({
156
  'type': 'table',
157
  'table_index': idx,
 
159
  'summary': summary,
160
  'table_length': len(table_content)
161
  })
 
162
  if self.debug:
163
  self._debug_print(f"Table {idx} Summary", summary)
 
164
  except Exception as e:
165
  print(f"Error summarizing table {idx}: {e}")
 
166
  return summaries
167
 
168
  def process_and_store_document(
169
+ self,
170
+ text: str,
171
  images: List[Dict],
172
  tables: List[Dict],
173
  vector_store,
174
  doc_id: str
175
  ) -> Dict:
176
+ print("PROCESSING WITH VISUAL IMAGE ANALYSIS: " + doc_id)
 
 
 
 
 
177
  results = {
178
  'doc_id': doc_id,
179
  'image_visual_analyses': [],
 
181
  'table_summaries': [],
182
  'total_stored': 0
183
  }
184
+ print("VISUAL IMAGE ANALYSIS (" + str(len(images)) + " total)")
 
 
 
185
  image_analyses = self.analyze_images_visually(images)
186
  results['image_visual_analyses'] = image_analyses
 
187
  image_docs = {
188
  'text': ' | '.join([
189
+ f"Image {a['image_index']}: {a['visual_analysis']}"
190
  for a in image_analyses
191
  ]),
192
  'images': [],
193
  'tables': []
194
  }
 
195
  for analysis in image_analyses:
196
+ print(f" Image {analysis['image_index']} (visual analysis)")
197
+ print(f" Path: {analysis['image_path']}")
198
+ print(f" Analysis: {analysis['visual_analysis'][:100]}...")
 
199
  if image_analyses:
200
  try:
201
  vector_store.add_documents(
202
+ image_docs,
203
  f"{doc_id}_images_visual"
204
  )
205
  results['total_stored'] += len(image_analyses)
206
+ print(f"Stored {len(image_analyses)} image visual analyses")
207
  except Exception as e:
208
  print(f"Error storing image analyses: {e}")
209
+ print("TEXT CHUNK SUMMARIZATION")
 
 
210
  text_summaries = self.summarize_text_chunks(text)
211
  results['text_summaries'] = text_summaries
 
212
  text_docs = {
213
+ 'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
214
+ for s in text_summaries]),
215
  'images': [],
216
  'tables': []
217
  }
 
218
  for summary in text_summaries:
219
+ print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
 
220
  if text_summaries:
221
  try:
222
  vector_store.add_documents(
 
224
  f"{doc_id}_text_chunks"
225
  )
226
  results['total_stored'] += len(text_summaries)
227
+ print(f"Stored {len(text_summaries)} text chunk summaries")
228
  except Exception as e:
229
+ print(f"Error storing text summaries: {e}")
230
+ print("TABLE SUMMARIZATION (" + str(len(tables)) + " total)")
 
 
231
  table_summaries = self.summarize_tables(tables)
232
  results['table_summaries'] = table_summaries
 
233
  table_docs = {
234
+ 'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
235
+ for s in table_summaries]),
236
  'images': [],
237
  'tables': []
238
  }
 
239
  for summary in table_summaries:
240
+ print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
 
241
  if table_summaries:
242
  try:
243
  vector_store.add_documents(
 
245
  f"{doc_id}_tables"
246
  )
247
  results['total_stored'] += len(table_summaries)
248
+ print(f"Stored {len(table_summaries)} table summaries")
249
  except Exception as e:
250
+ print(f"Error storing table summaries: {e}")
251
+ print("STORAGE SUMMARY")
252
+ print(f" Images analyzed and stored: {len(image_analyses)}")
253
+ print(f" Text chunks summarized and stored: {len(text_summaries)}")
254
+ print(f" Tables summarized and stored: {len(table_summaries)}")
255
+ print(f" Total items stored in vector: {results['total_stored']}")
 
 
 
256
  return results
257
 
258
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
 
264
  start = end - overlap
265
  return chunks
266
 
 
 
 
 
267
  class AnsweringRAG:
 
 
 
 
 
 
 
268
  def __init__(self, api_key: str = None, debug: bool = True):
269
  api_key = api_key or OPENAI_API_KEY
270
  self.debug = debug
 
271
  self.llm = ChatOpenAI(
272
+ model_name="gpt-4o-mini",
273
  api_key=api_key,
274
  temperature=TEMPERATURE,
275
  max_tokens=MAX_TOKENS,
276
  )
 
277
  self.language = LANGUAGE
 
 
278
  if self.debug:
279
+ print("AnsweringRAG initialized")
280
 
281
  def _debug_print(self, label: str, data: any):
 
282
  if self.debug:
283
+ print(f"DEBUG [{label}]:")
284
  if isinstance(data, (list, dict)):
285
+ print(f" Type: {type(data).__name__}")
286
+ print(f" Content: {str(data)[:300]}...")
287
  else:
288
+ print(f" {data}")
289
 
290
  def analyze_and_answer(
291
+ self,
292
+ question: str,
293
  search_results: List[Dict]
294
  ) -> Dict:
295
+ print("ANALYZING QUESTION & GENERATING ANSWER")
296
+ print(f"Question: {question}")
297
+ print(f"Search Results Found: {len(search_results)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  if not search_results:
299
+ print("No search results found!")
300
+ answer = f"""No relevant information in the document to answer question: "{question}"
301
  """
 
302
  result = {
303
  'question': question,
304
  'answer': answer,
 
306
  'confidence': 'low',
307
  'search_results': []
308
  }
 
309
  return result
 
310
  context_parts = []
311
  for idx, result in enumerate(search_results, 1):
312
  content = result.get('content', '')
 
313
  content_type = result.get('type', 'unknown')
314
  distance = result.get('distance', 0)
315
  relevance = 1 - distance if distance else 0
 
316
  context_parts.append(f"""
317
  [Source {idx} - {content_type.upper()} (relevance: {relevance:.1%})]
318
  {content}""")
 
319
  full_context = "\n".join(context_parts)
320
+ self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
321
+ analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
322
+ USER QUESTION:
 
 
 
323
  "{question}"
324
+ RELEVANT CONTENT FROM DOCUMENT:
 
325
  {full_context}
326
+ INSTRUCTIONS:
327
+ 1. Analyze the provided content carefully
328
+ 2. Extract information relevant to the question
329
+ 3. Synthesize a clear, comprehensive answer in {self.language}
330
+ 4. If the content doesn't fully answer the question, explain what information is available
331
+ 5. Be specific and cite the content when relevant
332
+ 6. Structure your answer clearly with key points
333
+ ANSWER:"""
334
+ print("Analyzing search results...")
335
+ print(f" Context size: {len(full_context)} characters")
336
+ print(f" Sources: {len(search_results)}")
 
 
 
337
  try:
338
  message = HumanMessage(content=analysis_prompt)
339
  response = self.llm.invoke([message])
340
  answer = response.content.strip()
 
341
  confidence = self._estimate_confidence(len(search_results), answer)
342
+ print("Answer generated successfully")
343
+ print(f" Confidence: {confidence}")
344
+ print(f" Answer length: {len(answer)} characters")
 
 
345
  result = {
346
  'question': question,
347
  'answer': answer,
348
  'sources_used': len(search_results),
349
  'confidence': confidence,
350
+ 'search_results': search_results,
351
+ 'formatted_sources': self._format_sources(search_results)
352
  }
 
 
353
  return result
 
354
  except Exception as e:
355
+ print(f"Error generating answer: {e}")
356
+ answer = "Error while analyzing the search results"
 
357
  result = {
358
  'question': question,
359
  'answer': answer,
 
362
  'error': str(e),
363
  'search_results': search_results
364
  }
 
 
365
  return result
366
 
367
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
 
368
  answer_length = len(answer)
 
369
  if sources_count >= 3 and answer_length > 500:
370
  return "high"
 
371
  elif sources_count >= 2 and answer_length > 200:
372
  return "medium"
 
373
  else:
374
  return "low"
375
+
376
+ def _format_sources(self, search_results: List[Dict]) -> List[Dict]:
377
+ formatted_sources = []
378
+ for idx, source in enumerate(search_results, 1):
379
+ formatted_sources.append({
380
+ 'index': idx,
381
+ 'type': source.get('type', 'unknown'),
382
+ 'content': source.get('content', ''),
383
+ 'relevance': 1 - source.get('distance', 0) if source.get('distance') else 0
384
+ })
385
+ return formatted_sources
src/vector_store.py CHANGED
@@ -1,209 +1,88 @@
1
- """
2
- Векторное хранилище и Эмбеддер"
3
- """
4
  import os
5
- import json
6
  from typing import List, Dict
 
7
  import chromadb
8
- from sentence_transformers import SentenceTransformer
9
- import numpy as np
10
- from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
11
-
12
-
13
- class CLIPEmbedder:
14
- """Эмбеддер"""
15
- def __init__(self, model_name: str = EMBEDDING_MODEL):
16
- print(f"Embedding model: {model_name}")
17
- self.model = SentenceTransformer(model_name)
18
- print(f"Model loaded successfully")
19
-
20
- def embed(self, text: str) -> List[float]:
21
- """Эмбеддинг для текста"""
22
- try:
23
- embedding = self.model.encode(text, convert_to_numpy=False)
24
- return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
25
- except Exception as e:
26
- print(f"Error embedding text: {e}")
27
- return [0.0] * EMBEDDING_DIM
28
-
29
- def embed_batch(self, texts: List[str]) -> List[List[float]]:
30
- """Эмбеддинг для текста"""
31
- try:
32
- embeddings = self.model.encode(texts, convert_to_numpy=False)
33
- return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
34
- except Exception as e:
35
- print(f"Error embedding batch: {e}")
36
- return [[0.0] * EMBEDDING_DIM] * len(texts)
37
-
38
 
39
  class VectorStore:
40
- """Векторное хранилище"""
41
  def __init__(self):
42
- self.persist_directory = CHROMA_DB_PATH
43
- self.embedder = CLIPEmbedder()
44
-
45
- print(f"\nInitializing ChromaDB: {self.persist_directory}")
46
-
 
 
 
 
 
 
 
 
 
47
  try:
48
- self.client = chromadb.PersistentClient(
49
- path=self.persist_directory
 
 
 
 
 
 
 
 
 
50
  )
51
- print(f"ChromaDB initialized")
52
  except Exception as e:
53
- print(f"Error initializing ChromaDB: {e}")
54
- self.client = chromadb.PersistentClient(
55
- path=self.persist_directory
56
- )
57
-
58
- try:
59
- self.collection = self.client.get_or_create_colletion(
60
- name="multimodal_rag",
61
- metadata={"hnsw:space": "cosine"}
62
- )
63
- count = self.collection.count()
64
- print(f"Collection loaded: {count} items in store")
65
- except Exception as e:
66
- print(f"Error with collection: {e}")
67
- self.collection = self.client.get_or_create_collection(
68
- name="multimodal_rag"
69
- )
70
-
71
- def add_documents(self, documents: List[Dict], doc_id: str):
72
- """Добавление документов в векторное хранилище"""
73
- texts = []
74
- metadatas = []
75
- ids = []
76
-
77
- print(f"\nAdding document: {doc_id}")
78
-
79
- if 'text' in documents and documents['text']:
80
- chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
81
- for idx, chunk in enumerate(chunks):
82
- texts.append(chunk)
83
- metadatas.append({
84
- 'doc_id': doc_id,
85
- 'type': 'text',
86
- 'chunk_idx': str(idx)
87
- })
88
- ids.append(f"{doc_id}_text_{idx}")
89
- print(f" Text: {len(chunks)} chunks")
90
-
91
- if 'images' in documents:
92
- image_count = 0
93
- for idx, image_data in enumerate(documents['images']):
94
- if image_data.get('ocr_text'):
95
- texts.append(f"Image {idx}: {image_data['ocr_text']}")
96
- metadatas.append({
97
- 'doc_id': doc_id,
98
- 'type': 'image',
99
- 'image_idx': str(idx),
100
- 'image_path': image_data.get('path', '')
101
- })
102
- ids.append(f"{doc_id}_image_{idx}")
103
- image_count += 1
104
- if image_count > 0:
105
- print(f" Images: {image_count} with OCR text")
106
-
107
- if 'tables' in documents:
108
- table_count = 0
109
- for idx, table_data in enumerate(documents['tables']):
110
- if table_data.get('content'):
111
- texts.append(f"Table {idx}: {table_data.get('content', '')}")
112
- metadatas.append({
113
- 'doc_id': doc_id,
114
- 'type': 'table',
115
- 'table_idx': str(idx)
116
- })
117
- ids.append(f"{doc_id}_table_{idx}")
118
- table_count += 1
119
- if table_count > 0:
120
- print(f" Tables: {table_count}")
121
-
122
- if texts:
123
- print(f" 🔄 Generating {len(texts)} embeddings...")
124
- embeddings = self.embedder.embed_batch(texts)
125
-
126
- try:
127
- self.collection.add(
128
- ids=ids,
129
- documents=texts,
130
- embeddings=embeddings,
131
- metadatas=metadatas
132
- )
133
- print(f"Successfully added {len(texts)} items to vector store")
134
- except Exception as e:
135
- print(f"Error adding to collection: {e}")
136
 
137
  def search(self, query: str, n_results: int = 5) -> List[Dict]:
138
- """Поиск в векторном хранилище"""
139
  try:
140
- query_embedding = self.embedder.embed(query)
141
-
142
  results = self.collection.query(
143
- query_embeddings=[query_embedding],
144
- n_results=n_results
 
145
  )
146
-
147
  formatted_results = []
148
- if results['documents']:
149
- for i, doc in enumerate(results['documents'][0]):
150
- metadata = results['metadatas'][0][i] if results['metadatas'] else {}
151
- distance = results['distances'][0][i] if results['distances'] else 0
152
-
153
  formatted_results.append({
154
  'content': doc,
155
- 'metadata': metadata,
156
  'distance': distance,
157
- 'type': metadata.get('type', 'unknown')
158
  })
159
-
160
  return formatted_results
161
  except Exception as e:
162
  print(f"Error searching vector store: {e}")
163
  return []
164
 
165
- def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
166
- """Сплит текста"""
167
- chunks = []
168
- start = 0
169
- while start < len(text):
170
- end = start + chunk_size
171
- chunks.append(text[start:end])
172
- start = end - overlap
173
- return chunks
174
-
175
  def get_collection_info(self) -> Dict:
176
- """Получение информации о коллекции в вектороном хранилище"""
177
  try:
178
  count = self.collection.count()
179
  return {
180
- 'name': 'multimodal_rag',
181
  'count': count,
182
- 'status': 'active',
183
- 'persist_path': self.persist_directory
184
  }
185
  except Exception as e:
186
  print(f"Error getting collection info: {e}")
187
- return {'status': 'error', 'message': str(e)}
188
-
189
- def delete_by_doc_id(self, doc_id: str):
190
- """Удаление документа из векторного хранилища"""
191
- try:
192
- results = self.collection.get(where={'doc_id': doc_id})
193
- if results['ids']:
194
- self.collection.delete(ids=results['ids'])
195
- print(f"Deleted {len(results['ids'])} documents for {doc_id}")
196
- except Exception as e:
197
- print(f"Error deleting documents: {e}")
198
 
199
  def clear_all(self):
200
- """Очистка хранилища"""
201
  try:
202
- self.client.delete_collection(name="multimodal_rag")
203
  self.collection = self.client.get_or_create_collection(
204
- name="multimodal_rag",
205
  metadata={"hnsw:space": "cosine"}
206
  )
207
- print("Collection cleared")
208
  except Exception as e:
209
- print(f"Error clearing collection: {e}")
 
 
 
 
1
  import os
 
2
  from typing import List, Dict
3
+ from chromadb.config import Settings
4
  import chromadb
5
+ from config import CHROMA_DB_PATH
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
class VectorStore:
    """Persistent ChromaDB-backed store for parsed PDF documents.

    One collection ("documents", cosine space) holds one entry per PDF,
    keyed by ``doc_id``.
    """

    _COLLECTION_NAME = "documents"

    def __init__(self):
        self.chroma_path = CHROMA_DB_PATH
        # `chroma_db_impl_embed_collection_mixin` / `persist_directory` are
        # not valid Settings fields on current chromadb; passing them to
        # chromadb.Client() either raises or silently yields an *in-memory*
        # client that drops data on exit. PersistentClient(path=...) is the
        # supported way to persist to disk on chromadb >= 0.4.
        self.settings = Settings(
            anonymized_telemetry=False,
            allow_reset=True,
        )
        self.client = chromadb.PersistentClient(
            path=self.chroma_path,
            settings=self.settings,
        )
        self.collection = self.client.get_or_create_collection(
            name=self._COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"},
        )

    def add_documents(self, documents: Dict, doc_id: str):
        """Store the parsed document's text under ``doc_id``.

        Args:
            documents: parser output; only the ``'text'`` field is indexed.
            doc_id: unique identifier for the source PDF.

        Raises:
            Re-raises any ChromaDB error after logging it.
        """
        try:
            text = documents.get('text', '')
            if not text or not text.strip():
                print(f"Empty text for {doc_id}")
                return
            # upsert instead of add: reprocessing the same PDF must update
            # the entry rather than fail with a duplicate-ID error.
            self.collection.upsert(
                ids=[doc_id],
                documents=[text],
                metadatas=[{
                    'doc_id': doc_id,
                    'source': 'pdf_document'
                }]
            )
            print(f"Added document to vector store: {doc_id}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

    def search(self, query: str, n_results: int = 5) -> List[Dict]:
        """Return up to ``n_results`` hits for ``query``.

        Each hit is a dict with ``content``, ``metadata``, ``distance``
        (cosine, smaller is closer) and ``type``. Returns [] on error.
        """
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
                # Embeddings were requested but never consumed by callers;
                # excluding them avoids shipping large vectors per search.
                include=['documents', 'metadatas', 'distances']
            )
            formatted_results = []
            if results and results['documents'] and len(results['documents']) > 0:
                for idx, doc in enumerate(results['documents'][0]):
                    distance = results['distances'][0][idx] if results['distances'] else 0
                    formatted_results.append({
                        'content': doc,
                        'metadata': results['metadatas'][0][idx] if results['metadatas'] else {},
                        'distance': distance,
                        'type': 'document'
                    })
            return formatted_results
        except Exception as e:
            print(f"Error searching vector store: {e}")
            return []

    def get_collection_info(self) -> Dict:
        """Report item count, status and on-disk path of the collection."""
        try:
            count = self.collection.count()
            return {
                'count': count,
                'status': 'ready',
                'persist_path': self.chroma_path
            }
        except Exception as e:
            print(f"Error getting collection info: {e}")
            return {
                'count': 0,
                'status': 'error',
                'persist_path': self.chroma_path
            }

    def clear_all(self):
        """Drop and recreate the collection, discarding all stored documents."""
        try:
            self.client.delete_collection(name=self._COLLECTION_NAME)
            self.collection = self.client.get_or_create_collection(
                name=self._COLLECTION_NAME,
                metadata={"hnsw:space": "cosine"}
            )
            print("Vector store cleared")
        except Exception as e:
            print(f"Error clearing vector store: {e}")