Spaces:
Sleeping
Sleeping
Simplify
Browse files- src/app.py +77 -209
- src/config.py +10 -18
- src/pdf_parser.py +26 -48
- src/rag_system.py +89 -187
- src/vector_store.py +26 -50
src/app.py
CHANGED
|
@@ -1,33 +1,25 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
Complete working version with VISUAL image analysis using gpt-4o
|
| 4 |
"""
|
| 5 |
|
| 6 |
import streamlit as st
|
| 7 |
import os
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
-
# Import optimized versions
|
| 11 |
from pdf_parser import PDFParser
|
| 12 |
from vector_store import VectorStore
|
| 13 |
-
from rag_system import VisualMultimodalRAG
|
| 14 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 15 |
|
| 16 |
|
| 17 |
-
# ============================================================================
|
| 18 |
-
# PAGE CONFIGURATION
|
| 19 |
-
# ============================================================================
|
| 20 |
|
| 21 |
st.set_page_config(
|
| 22 |
-
page_title="
|
| 23 |
-
page_icon="🤖",
|
| 24 |
layout="wide",
|
| 25 |
initial_sidebar_state="expanded"
|
| 26 |
)
|
| 27 |
|
| 28 |
-
|
| 29 |
-
# SESSION STATE INITIALIZATION
|
| 30 |
-
# ============================================================================
|
| 31 |
|
| 32 |
if 'api_key_set' not in st.session_state:
|
| 33 |
st.session_state.api_key_set = False
|
|
@@ -35,7 +27,7 @@ if 'api_key_set' not in st.session_state:
|
|
| 35 |
if 'api_key' not in st.session_state:
|
| 36 |
st.session_state.api_key = None
|
| 37 |
|
| 38 |
-
if 'visual_rag_system' not in st.session_state:
|
| 39 |
st.session_state.visual_rag_system = None
|
| 40 |
|
| 41 |
if 'vector_store' not in st.session_state:
|
|
@@ -56,39 +48,28 @@ if 'current_images' not in st.session_state:
|
|
| 56 |
if 'current_tables' not in st.session_state:
|
| 57 |
st.session_state.current_tables = None
|
| 58 |
|
| 59 |
-
if 'processing_results' not in st.session_state:
|
| 60 |
st.session_state.processing_results = None
|
| 61 |
|
| 62 |
if 'answering_rag' not in st.session_state:
|
| 63 |
st.session_state.answering_rag = None
|
| 64 |
|
| 65 |
|
| 66 |
-
# ============================================================================
|
| 67 |
-
# MAIN HEADER
|
| 68 |
-
# ============================================================================
|
| 69 |
|
| 70 |
-
st.title("
|
| 71 |
st.markdown("""
|
| 72 |
-
|
| 73 |
-
- **PDF Parser** with OCR for Russian & English
|
| 74 |
-
- **Visual Analysis** (gpt-4o) for image understanding
|
| 75 |
-
- **Vector Store** (ChromaDB) for semantic search
|
| 76 |
-
- **Individual Component** summarization and storage
|
| 77 |
""")
|
| 78 |
|
| 79 |
|
| 80 |
-
# ============================================================================
|
| 81 |
-
# SIDEBAR - CONFIGURATION
|
| 82 |
-
# ============================================================================
|
| 83 |
|
| 84 |
with st.sidebar:
|
| 85 |
-
st.header("
|
| 86 |
|
| 87 |
-
|
| 88 |
-
st.subheader("🔑 OpenAI API Key")
|
| 89 |
|
| 90 |
api_key = st.text_input(
|
| 91 |
-
"
|
| 92 |
type="password",
|
| 93 |
key="api_key_input"
|
| 94 |
)
|
|
@@ -97,62 +78,53 @@ with st.sidebar:
|
|
| 97 |
st.session_state.api_key = api_key
|
| 98 |
st.session_state.api_key_set = True
|
| 99 |
|
| 100 |
-
# Initialize RAG systems if not already done
|
| 101 |
if st.session_state.visual_rag_system is None:
|
| 102 |
try:
|
| 103 |
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
|
| 104 |
st.session_state.vector_store = VectorStore()
|
| 105 |
st.session_state.parser = PDFParser(debug=True)
|
| 106 |
-
st.success("
|
| 107 |
except Exception as e:
|
| 108 |
-
st.error(f"
|
| 109 |
else:
|
| 110 |
st.session_state.api_key_set = False
|
| 111 |
-
st.warning("
|
| 112 |
|
| 113 |
st.divider()
|
| 114 |
|
| 115 |
-
|
| 116 |
-
st.subheader("📊 Vector Store Status")
|
| 117 |
if st.session_state.vector_store:
|
| 118 |
try:
|
| 119 |
info = st.session_state.vector_store.get_collection_info()
|
| 120 |
-
st.metric("
|
| 121 |
-
st.
|
| 122 |
-
st.caption(f"Path: {info['persist_path']}")
|
| 123 |
except Exception as e:
|
| 124 |
-
st.error(f"
|
| 125 |
else:
|
| 126 |
-
st.info("
|
| 127 |
|
| 128 |
st.divider()
|
| 129 |
|
| 130 |
-
|
| 131 |
-
st.
|
| 132 |
-
if st.button("🔄 Clear Vector Store"):
|
| 133 |
if st.session_state.vector_store:
|
| 134 |
try:
|
| 135 |
st.session_state.vector_store.clear_all()
|
| 136 |
-
st.success("
|
| 137 |
except Exception as e:
|
| 138 |
-
st.error(f"
|
| 139 |
|
| 140 |
|
| 141 |
-
# ============================================================================
|
| 142 |
-
# MAIN CONTENT
|
| 143 |
-
# ============================================================================
|
| 144 |
|
| 145 |
-
|
| 146 |
-
st.header("📤 Upload PDF Document")
|
| 147 |
|
| 148 |
uploaded_file = st.file_uploader(
|
| 149 |
-
"
|
| 150 |
type=['pdf'],
|
| 151 |
-
help="PDF
|
| 152 |
)
|
| 153 |
|
| 154 |
if uploaded_file is not None:
|
| 155 |
-
# Save uploaded file
|
| 156 |
upload_path = Path(UPLOAD_FOLDER)
|
| 157 |
upload_path.mkdir(exist_ok=True)
|
| 158 |
|
|
@@ -160,91 +132,64 @@ if uploaded_file is not None:
|
|
| 160 |
with open(file_path, 'wb') as f:
|
| 161 |
f.write(uploaded_file.getbuffer())
|
| 162 |
|
| 163 |
-
st.success(f"
|
| 164 |
|
| 165 |
-
|
| 166 |
-
if st.button("🔍 Parse PDF"):
|
| 167 |
if not st.session_state.api_key_set:
|
| 168 |
-
st.error("
|
| 169 |
else:
|
| 170 |
try:
|
| 171 |
-
with st.spinner("
|
| 172 |
-
|
| 173 |
-
print(f"
|
| 174 |
-
|
| 175 |
|
| 176 |
-
# Parse PDF - returns text, images, tables
|
| 177 |
parser = st.session_state.parser
|
| 178 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 179 |
|
| 180 |
-
# Store in session state
|
| 181 |
st.session_state.current_document = uploaded_file.name
|
| 182 |
st.session_state.current_text = text
|
| 183 |
st.session_state.current_images = images
|
| 184 |
st.session_state.current_tables = tables
|
| 185 |
|
| 186 |
-
# Display results
|
| 187 |
col1, col2, col3 = st.columns(3)
|
| 188 |
with col1:
|
| 189 |
-
st.metric("
|
| 190 |
with col2:
|
| 191 |
-
st.metric("
|
| 192 |
with col3:
|
| 193 |
-
st.metric("
|
| 194 |
|
| 195 |
-
|
| 196 |
-
if images:
|
| 197 |
-
st.subheader("🖼️ Extracted Images")
|
| 198 |
-
for idx, img in enumerate(images):
|
| 199 |
-
ocr_text = img.get('ocr_text', '')
|
| 200 |
-
ocr_len = len(ocr_text)
|
| 201 |
-
|
| 202 |
-
if ocr_len > 0:
|
| 203 |
-
st.success(f"✅ Image {idx}: {ocr_len} characters (OCR)")
|
| 204 |
-
else:
|
| 205 |
-
st.warning(f"⚠️ Image {idx}: No OCR text (will use visual analysis)")
|
| 206 |
-
|
| 207 |
-
st.success("✅ PDF parsing complete!")
|
| 208 |
|
| 209 |
except Exception as e:
|
| 210 |
-
st.error(f"
|
| 211 |
-
print(f"
|
| 212 |
|
| 213 |
|
| 214 |
-
# ============================================================================
|
| 215 |
-
# VISUAL IMAGE ANALYSIS & COMPONENT STORAGE
|
| 216 |
-
# ============================================================================
|
| 217 |
|
| 218 |
st.divider()
|
| 219 |
-
st.header("
|
| 220 |
|
| 221 |
st.info("""
|
| 222 |
-
|
| 223 |
-
1. Images are sent to gpt-4o for visual analysis (not just text OCR)
|
| 224 |
-
2. Text is split into chunks and each chunk is summarized
|
| 225 |
-
3. Tables are analyzed individually
|
| 226 |
-
4. ALL summaries are stored in the vector store for semantic search
|
| 227 |
""")
|
| 228 |
|
| 229 |
-
if st.button("
|
| 230 |
if not st.session_state.api_key_set:
|
| 231 |
-
st.error("
|
| 232 |
elif st.session_state.current_text is None:
|
| 233 |
-
st.error("
|
| 234 |
else:
|
| 235 |
try:
|
| 236 |
-
with st.spinner("
|
| 237 |
-
print(f"\n{'='*70}")
|
| 238 |
-
print(f"VISUAL IMAGE ANALYSIS")
|
| 239 |
-
print(f"{'='*70}")
|
| 240 |
|
| 241 |
-
# Process with visual analysis
|
| 242 |
visual_rag = st.session_state.visual_rag_system
|
| 243 |
vector_store = st.session_state.vector_store
|
| 244 |
|
| 245 |
results = visual_rag.process_and_store_document(
|
| 246 |
text=st.session_state.current_text,
|
| 247 |
-
images=st.session_state.current_images,
|
| 248 |
tables=st.session_state.current_tables,
|
| 249 |
vector_store=vector_store,
|
| 250 |
doc_id=st.session_state.current_document or "current_doc"
|
|
@@ -252,107 +197,55 @@ if st.button("🖼️ Analyze Images Visually & Store Components"):
|
|
| 252 |
|
| 253 |
st.session_state.processing_results = results
|
| 254 |
|
| 255 |
-
|
| 256 |
-
st.success("✅ Visual analysis complete & stored!")
|
| 257 |
|
| 258 |
col1, col2, col3 = st.columns(3)
|
| 259 |
with col1:
|
| 260 |
-
st.metric("
|
| 261 |
with col2:
|
| 262 |
-
st.metric("
|
| 263 |
with col3:
|
| 264 |
-
st.metric("
|
| 265 |
-
|
| 266 |
-
st.metric("📊 Total Stored in Vector", results['total_stored'])
|
| 267 |
-
|
| 268 |
-
# Show image visual analyses
|
| 269 |
-
if results['image_visual_analyses']:
|
| 270 |
-
st.subheader("🖼️ Visual Image Analyses (gpt-4o)")
|
| 271 |
-
for img_analysis in results['image_visual_analyses']:
|
| 272 |
-
with st.expander(f"Image {img_analysis['image_index']} - Visual Analysis"):
|
| 273 |
-
st.write("**Visual Analysis by gpt-4o:**")
|
| 274 |
-
st.write(img_analysis['visual_analysis'])
|
| 275 |
-
|
| 276 |
-
st.write("**Image Path:**")
|
| 277 |
-
st.code(img_analysis['image_path'])
|
| 278 |
-
|
| 279 |
-
if img_analysis['ocr_text']:
|
| 280 |
-
st.write("**OCR Text (backup):**")
|
| 281 |
-
st.text(img_analysis['ocr_text'][:500])
|
| 282 |
-
|
| 283 |
-
# Show text chunk summaries
|
| 284 |
-
if results['text_summaries']:
|
| 285 |
-
st.subheader("📝 Text Chunk Summaries")
|
| 286 |
-
for chunk_summary in results['text_summaries']:
|
| 287 |
-
with st.expander(
|
| 288 |
-
f"Chunk {chunk_summary['chunk_index']} "
|
| 289 |
-
f"({chunk_summary['chunk_length']} chars)"
|
| 290 |
-
):
|
| 291 |
-
st.write("**Summary:**")
|
| 292 |
-
st.write(chunk_summary['summary'])
|
| 293 |
-
st.write("**Original Text (first 500 chars):**")
|
| 294 |
-
st.text(chunk_summary['original_text'])
|
| 295 |
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
st.subheader("📋 Table Analyses")
|
| 299 |
-
for table_summary in results['table_summaries']:
|
| 300 |
-
with st.expander(
|
| 301 |
-
f"Table {table_summary['table_index']} "
|
| 302 |
-
f"({table_summary['table_length']} chars)"
|
| 303 |
-
):
|
| 304 |
-
st.write("**Analysis:**")
|
| 305 |
-
st.write(table_summary['summary'])
|
| 306 |
-
st.write("**Original Content (first 500 chars):**")
|
| 307 |
-
st.text(table_summary['original_content'])
|
| 308 |
|
| 309 |
-
print(f"
|
| 310 |
|
| 311 |
except Exception as e:
|
| 312 |
-
st.error(f"
|
| 313 |
-
print(f"
|
| 314 |
|
| 315 |
|
| 316 |
-
# ============================================================================
|
| 317 |
-
# QUESTION & ANSWERING
|
| 318 |
-
# ============================================================================
|
| 319 |
|
| 320 |
st.divider()
|
| 321 |
-
st.header("
|
| 322 |
|
| 323 |
-
# Initialize answering system if not done
|
| 324 |
if 'answering_rag' not in st.session_state:
|
| 325 |
st.session_state.answering_rag = None
|
| 326 |
|
| 327 |
-
# Create answering system when API key is set
|
| 328 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
| 329 |
from rag_system import AnsweringRAG
|
| 330 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 331 |
|
| 332 |
question = st.text_area(
|
| 333 |
-
"
|
| 334 |
height=100,
|
| 335 |
-
placeholder="
|
| 336 |
)
|
| 337 |
|
| 338 |
-
if st.button("
|
| 339 |
if not st.session_state.api_key_set:
|
| 340 |
-
st.error("
|
| 341 |
elif st.session_state.current_text is None:
|
| 342 |
-
st.error("
|
| 343 |
elif not question:
|
| 344 |
-
st.error("
|
| 345 |
else:
|
| 346 |
try:
|
| 347 |
-
with st.spinner("
|
| 348 |
-
print(f"\n{'='*70}")
|
| 349 |
-
print(f"QUESTION: {question}")
|
| 350 |
-
print(f"{'='*70}")
|
| 351 |
-
|
| 352 |
-
# Search vector store
|
| 353 |
store = st.session_state.vector_store
|
| 354 |
|
| 355 |
-
# Add documents to store if needed
|
| 356 |
doc_name = st.session_state.current_document or "current_doc"
|
| 357 |
doc_data = {
|
| 358 |
'text': st.session_state.current_text,
|
|
@@ -361,21 +254,17 @@ if st.button("🔍 Search & Generate Answer"):
|
|
| 361 |
}
|
| 362 |
store.add_documents(doc_data, doc_name)
|
| 363 |
|
| 364 |
-
# Search for relevant results
|
| 365 |
search_results = store.search(question, n_results=5)
|
| 366 |
|
| 367 |
-
print(f"
|
| 368 |
|
| 369 |
-
# Analyze results and generate answer
|
| 370 |
answering_rag = st.session_state.answering_rag
|
| 371 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 372 |
|
| 373 |
-
|
| 374 |
-
st.success("✅ Analysis complete!")
|
| 375 |
|
| 376 |
-
st.subheader("
|
| 377 |
|
| 378 |
-
# Show confidence level
|
| 379 |
col1, col2, col3 = st.columns(3)
|
| 380 |
with col1:
|
| 381 |
confidence_color = {
|
|
@@ -383,56 +272,35 @@ if st.button("🔍 Search & Generate Answer"):
|
|
| 383 |
'medium': '🟡',
|
| 384 |
'low': '🔴'
|
| 385 |
}.get(result['confidence'], '⚪')
|
| 386 |
-
st.metric("
|
| 387 |
with col2:
|
| 388 |
-
st.metric("
|
| 389 |
with col3:
|
| 390 |
if result['sources_used'] > 0:
|
| 391 |
-
st.metric("
|
| 392 |
|
| 393 |
-
# Display the generated answer
|
| 394 |
st.write(result['answer'])
|
| 395 |
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
st.subheader("Sources Used in Answer")
|
| 399 |
for idx, source in enumerate(result['formatted_sources'], 1):
|
| 400 |
relevance = source['relevance']
|
| 401 |
-
relevance_bar = "
|
| 402 |
|
| 403 |
with st.expander(
|
| 404 |
-
f"
|
| 405 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 406 |
):
|
| 407 |
st.write(source['content'])
|
| 408 |
|
| 409 |
-
print(f"
|
| 410 |
|
| 411 |
except Exception as e:
|
| 412 |
-
st.error(f"
|
| 413 |
-
print(f"
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
# ============================================================================
|
| 417 |
-
# FOOTER
|
| 418 |
-
# ============================================================================
|
| 419 |
|
| 420 |
st.divider()
|
| 421 |
|
| 422 |
-
col1, col2, col3 = st.columns(3)
|
| 423 |
-
|
| 424 |
-
with col1:
|
| 425 |
-
st.info("📖 **Text Processing**: PyPDF2 extraction with UTF-8 support")
|
| 426 |
-
|
| 427 |
-
with col2:
|
| 428 |
-
st.info("🖼️ **Visual Analysis**: GPT-4o vision for image understanding")
|
| 429 |
-
|
| 430 |
-
with col3:
|
| 431 |
-
st.info("📊 **Vector Storage**: ChromaDB with auto-persist")
|
| 432 |
-
|
| 433 |
st.caption(
|
| 434 |
-
"
|
| 435 |
-
"Visual Image Analysis | "
|
| 436 |
-
"Russian Language Support | "
|
| 437 |
-
"Individual Component Summarization"
|
| 438 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
+
UI RAG
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import streamlit as st
|
| 6 |
import os
|
| 7 |
from pathlib import Path
|
| 8 |
|
|
|
|
| 9 |
from pdf_parser import PDFParser
|
| 10 |
from vector_store import VectorStore
|
| 11 |
+
from rag_system import VisualMultimodalRAG
|
| 12 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
st.set_page_config(
|
| 17 |
+
page_title="Мультимодальная RAG система (PDF parsing)",
|
|
|
|
| 18 |
layout="wide",
|
| 19 |
initial_sidebar_state="expanded"
|
| 20 |
)
|
| 21 |
|
| 22 |
+
|
|
|
|
|
|
|
| 23 |
|
| 24 |
if 'api_key_set' not in st.session_state:
|
| 25 |
st.session_state.api_key_set = False
|
|
|
|
| 27 |
if 'api_key' not in st.session_state:
|
| 28 |
st.session_state.api_key = None
|
| 29 |
|
| 30 |
+
if 'visual_rag_system' not in st.session_state:
|
| 31 |
st.session_state.visual_rag_system = None
|
| 32 |
|
| 33 |
if 'vector_store' not in st.session_state:
|
|
|
|
| 48 |
if 'current_tables' not in st.session_state:
|
| 49 |
st.session_state.current_tables = None
|
| 50 |
|
| 51 |
+
if 'processing_results' not in st.session_state:
|
| 52 |
st.session_state.processing_results = None
|
| 53 |
|
| 54 |
if 'answering_rag' not in st.session_state:
|
| 55 |
st.session_state.answering_rag = None
|
| 56 |
|
| 57 |
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
st.title("Мультимодальная RAG система (PDF parsing)")
|
| 60 |
st.markdown("""
|
| 61 |
+
Обрабатывает PDF документы и предоставляет информацию по ним
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
""")
|
| 63 |
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
with st.sidebar:
|
| 67 |
+
st.header(" Конфигурация")
|
| 68 |
|
| 69 |
+
st.subheader(" OpenAI API Ключ")
|
|
|
|
| 70 |
|
| 71 |
api_key = st.text_input(
|
| 72 |
+
"Введите OpenAI API ключ:",
|
| 73 |
type="password",
|
| 74 |
key="api_key_input"
|
| 75 |
)
|
|
|
|
| 78 |
st.session_state.api_key = api_key
|
| 79 |
st.session_state.api_key_set = True
|
| 80 |
|
|
|
|
| 81 |
if st.session_state.visual_rag_system is None:
|
| 82 |
try:
|
| 83 |
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
|
| 84 |
st.session_state.vector_store = VectorStore()
|
| 85 |
st.session_state.parser = PDFParser(debug=True)
|
| 86 |
+
st.success("API ключ введен")
|
| 87 |
except Exception as e:
|
| 88 |
+
st.error(f"Ошибка старта системы: {e}")
|
| 89 |
else:
|
| 90 |
st.session_state.api_key_set = False
|
| 91 |
+
st.warning("Введите OpenAI API ключ")
|
| 92 |
|
| 93 |
st.divider()
|
| 94 |
|
| 95 |
+
st.subheader("Векторное хранилище")
|
|
|
|
| 96 |
if st.session_state.vector_store:
|
| 97 |
try:
|
| 98 |
info = st.session_state.vector_store.get_collection_info()
|
| 99 |
+
st.metric("Документов в хранилище", info['count'])
|
| 100 |
+
st.caption(f"Расположение: {info['persist_path']}")
|
|
|
|
| 101 |
except Exception as e:
|
| 102 |
+
st.error(f"Ошибка получения информации: {e}")
|
| 103 |
else:
|
| 104 |
+
st.info("Введите OpenAI API ключ")
|
| 105 |
|
| 106 |
st.divider()
|
| 107 |
|
| 108 |
+
st.subheader("Управление хранилищем")
|
| 109 |
+
if st.button("Очистить хранилище"):
|
|
|
|
| 110 |
if st.session_state.vector_store:
|
| 111 |
try:
|
| 112 |
st.session_state.vector_store.clear_all()
|
| 113 |
+
st.success("Хранилище очищено")
|
| 114 |
except Exception as e:
|
| 115 |
+
st.error(f"Ошибка очистки хранилища: {e}")
|
| 116 |
|
| 117 |
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
+
st.header("Загрузить PDF")
|
|
|
|
| 120 |
|
| 121 |
uploaded_file = st.file_uploader(
|
| 122 |
+
"Выбрать...",
|
| 123 |
type=['pdf'],
|
| 124 |
+
help="Загрузите PDF файл"
|
| 125 |
)
|
| 126 |
|
| 127 |
if uploaded_file is not None:
|
|
|
|
| 128 |
upload_path = Path(UPLOAD_FOLDER)
|
| 129 |
upload_path.mkdir(exist_ok=True)
|
| 130 |
|
|
|
|
| 132 |
with open(file_path, 'wb') as f:
|
| 133 |
f.write(uploaded_file.getbuffer())
|
| 134 |
|
| 135 |
+
st.success(f"Файл загружен: {uploaded_file.name}")
|
| 136 |
|
| 137 |
+
if st.button("Распарсить PDF"):
|
|
|
|
| 138 |
if not st.session_state.api_key_set:
|
| 139 |
+
st.error("Введите OpenAI API ключ")
|
| 140 |
else:
|
| 141 |
try:
|
| 142 |
+
with st.spinner(" Парсинг PDF..."):
|
| 143 |
+
|
| 144 |
+
print(f"Парсинг PDF файла: {uploaded_file.name}")
|
| 145 |
+
|
| 146 |
|
|
|
|
| 147 |
parser = st.session_state.parser
|
| 148 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 149 |
|
|
|
|
| 150 |
st.session_state.current_document = uploaded_file.name
|
| 151 |
st.session_state.current_text = text
|
| 152 |
st.session_state.current_images = images
|
| 153 |
st.session_state.current_tables = tables
|
| 154 |
|
|
|
|
| 155 |
col1, col2, col3 = st.columns(3)
|
| 156 |
with col1:
|
| 157 |
+
st.metric("Текста", f"{len(text):,} chars")
|
| 158 |
with col2:
|
| 159 |
+
st.metric("Изображений", len(images))
|
| 160 |
with col3:
|
| 161 |
+
st.metric("Таблиц", len(tables))
|
| 162 |
|
| 163 |
+
st.success("Парсинг PDF завершен!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
except Exception as e:
|
| 166 |
+
st.error(f"Парсинг PDF завершелся с ошибкой: {e}")
|
| 167 |
+
print(f"Ошибка: {e}")
|
| 168 |
|
| 169 |
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
st.divider()
|
| 172 |
+
st.header("Анализ документа")
|
| 173 |
|
| 174 |
st.info("""
|
| 175 |
+
Отправляет содержимое документа на анализ
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
""")
|
| 177 |
|
| 178 |
+
if st.button("Проанализировать документ"):
|
| 179 |
if not st.session_state.api_key_set:
|
| 180 |
+
st.error("Введите OpenAI API ключ")
|
| 181 |
elif st.session_state.current_text is None:
|
| 182 |
+
st.error("Распарсите документ")
|
| 183 |
else:
|
| 184 |
try:
|
| 185 |
+
with st.spinner("Анализ с gpt-4o-mini..."):
|
|
|
|
|
|
|
|
|
|
| 186 |
|
|
|
|
| 187 |
visual_rag = st.session_state.visual_rag_system
|
| 188 |
vector_store = st.session_state.vector_store
|
| 189 |
|
| 190 |
results = visual_rag.process_and_store_document(
|
| 191 |
text=st.session_state.current_text,
|
| 192 |
+
images=st.session_state.current_images,
|
| 193 |
tables=st.session_state.current_tables,
|
| 194 |
vector_store=vector_store,
|
| 195 |
doc_id=st.session_state.current_document or "current_doc"
|
|
|
|
| 197 |
|
| 198 |
st.session_state.processing_results = results
|
| 199 |
|
| 200 |
+
st.success("Анализ готов!")
|
|
|
|
| 201 |
|
| 202 |
col1, col2, col3 = st.columns(3)
|
| 203 |
with col1:
|
| 204 |
+
st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
|
| 205 |
with col2:
|
| 206 |
+
st.metric("Проанализировано чанков текста", len(results['text_summaries']))
|
| 207 |
with col3:
|
| 208 |
+
st.metric("Проанализировано таблиц", len(results['table_summaries']))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
+
st.metric("Помещено в хранилище", results['total_stored'])
|
| 211 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
+
print(f"Анализ завершен")
|
| 214 |
|
| 215 |
except Exception as e:
|
| 216 |
+
st.error(f"Ошибка в ходе: {e}")
|
| 217 |
+
print(f"Ошибка: {e}")
|
| 218 |
|
| 219 |
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
st.divider()
|
| 222 |
+
st.header("Работа с документом")
|
| 223 |
|
|
|
|
| 224 |
if 'answering_rag' not in st.session_state:
|
| 225 |
st.session_state.answering_rag = None
|
| 226 |
|
|
|
|
| 227 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
| 228 |
from rag_system import AnsweringRAG
|
| 229 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 230 |
|
| 231 |
question = st.text_area(
|
| 232 |
+
"Введите запрос:",
|
| 233 |
height=100,
|
| 234 |
+
placeholder="О чем данный документ?"
|
| 235 |
)
|
| 236 |
|
| 237 |
+
if st.button("Генерация ответа"):
|
| 238 |
if not st.session_state.api_key_set:
|
| 239 |
+
st.error("Введите OpenAI API ключ")
|
| 240 |
elif st.session_state.current_text is None:
|
| 241 |
+
st.error("Распарсите документ")
|
| 242 |
elif not question:
|
| 243 |
+
st.error("Введите запрос")
|
| 244 |
else:
|
| 245 |
try:
|
| 246 |
+
with st.spinner("Поиск документов..."):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
store = st.session_state.vector_store
|
| 248 |
|
|
|
|
| 249 |
doc_name = st.session_state.current_document or "current_doc"
|
| 250 |
doc_data = {
|
| 251 |
'text': st.session_state.current_text,
|
|
|
|
| 254 |
}
|
| 255 |
store.add_documents(doc_data, doc_name)
|
| 256 |
|
|
|
|
| 257 |
search_results = store.search(question, n_results=5)
|
| 258 |
|
| 259 |
+
print(f"Найдено: {len(search_results)}")
|
| 260 |
|
|
|
|
| 261 |
answering_rag = st.session_state.answering_rag
|
| 262 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 263 |
|
| 264 |
+
st.success("Поиск завершен!")
|
|
|
|
| 265 |
|
| 266 |
+
st.subheader("Ответ")
|
| 267 |
|
|
|
|
| 268 |
col1, col2, col3 = st.columns(3)
|
| 269 |
with col1:
|
| 270 |
confidence_color = {
|
|
|
|
| 272 |
'medium': '🟡',
|
| 273 |
'low': '🔴'
|
| 274 |
}.get(result['confidence'], '⚪')
|
| 275 |
+
st.metric("Уверенность в ответе", f"{confidence_color} {result['confidence'].upper()}")
|
| 276 |
with col2:
|
| 277 |
+
st.metric("Использовано источников", result['sources_used'])
|
| 278 |
with col3:
|
| 279 |
if result['sources_used'] > 0:
|
| 280 |
+
st.metric("Среднняя релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
|
| 281 |
|
|
|
|
| 282 |
st.write(result['answer'])
|
| 283 |
|
| 284 |
+
if st.checkbox("Показать исходные документы"):
|
| 285 |
+
st.subheader("Использованы докуме��ты")
|
|
|
|
| 286 |
for idx, source in enumerate(result['formatted_sources'], 1):
|
| 287 |
relevance = source['relevance']
|
| 288 |
+
relevance_bar = "\/" * int(relevance * 10) + "|" * (10 - int(relevance * 10))
|
| 289 |
|
| 290 |
with st.expander(
|
| 291 |
+
f"Источник {idx} - {source['type'].upper()} "
|
| 292 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 293 |
):
|
| 294 |
st.write(source['content'])
|
| 295 |
|
| 296 |
+
print(f" Ответ готов!")
|
| 297 |
|
| 298 |
except Exception as e:
|
| 299 |
+
st.error(f"Ошибка обработки запроса: {e}")
|
| 300 |
+
print(f"Ошибка: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
st.divider()
|
| 303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
st.caption(
|
| 305 |
+
"Мультимодальная RAG система для парсинга PDF документов"
|
|
|
|
|
|
|
|
|
|
| 306 |
)
|
src/config.py
CHANGED
|
@@ -1,42 +1,34 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
-
# API Configuration
|
| 8 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 9 |
-
OPENAI_MODEL = "gpt-4o-mini"
|
| 10 |
-
USE_CACHE = True
|
| 11 |
|
| 12 |
-
# Vector Store Configuration
|
| 13 |
CHROMA_DB_PATH = "./chroma_db"
|
| 14 |
DOCSTORE_PATH = "./docstore"
|
| 15 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 16 |
|
| 17 |
-
# Embedding Model Configuration
|
| 18 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 19 |
EMBEDDING_DIM = 768
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
MAX_TOKENS = 500 # Limit response size (vs 1500)
|
| 26 |
|
| 27 |
-
# Language Support
|
| 28 |
LANGUAGE = "russian"
|
| 29 |
|
| 30 |
-
# Create necessary directories
|
| 31 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 32 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 33 |
|
| 34 |
-
# PDF Upload Configuration
|
| 35 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 36 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 37 |
MAX_PDF_SIZE_MB = 50
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
SUMMARIZE_FIRST = True # Summarize PDFs once, not per query
|
|
|
|
| 1 |
"""
|
| 2 |
+
Конфигурационный файл
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
|
|
|
| 7 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 8 |
+
OPENAI_MODEL = "gpt-4o-mini"
|
| 9 |
+
USE_CACHE = True
|
| 10 |
|
|
|
|
| 11 |
CHROMA_DB_PATH = "./chroma_db"
|
| 12 |
DOCSTORE_PATH = "./docstore"
|
| 13 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 14 |
|
|
|
|
| 15 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 16 |
EMBEDDING_DIM = 768
|
| 17 |
|
| 18 |
+
MAX_CHUNK_SIZE = 500
|
| 19 |
+
CHUNK_OVERLAP = 50
|
| 20 |
+
TEMPERATURE = 0.3
|
| 21 |
+
MAX_TOKENS = 500
|
|
|
|
| 22 |
|
|
|
|
| 23 |
LANGUAGE = "russian"
|
| 24 |
|
|
|
|
| 25 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 26 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 27 |
|
|
|
|
| 28 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 29 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 30 |
MAX_PDF_SIZE_MB = 50
|
| 31 |
|
| 32 |
+
BATCH_SEARCH_RESULTS = 3
|
| 33 |
+
CACHE_RESPONSES = True
|
| 34 |
+
SUMMARIZE_FIRST = True
|
|
|
src/pdf_parser.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
PDF
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
@@ -20,27 +20,14 @@ class PDFParser:
|
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
|
| 23 |
-
|
| 24 |
self._configure_tesseract()
|
| 25 |
|
| 26 |
if self.debug:
|
| 27 |
-
print("
|
| 28 |
-
|
| 29 |
-
def _configure_tesseract(self):
|
| 30 |
-
"""Configure Tesseract with proper paths and language support"""
|
| 31 |
-
try:
|
| 32 |
-
# Windows specific path
|
| 33 |
-
if os.name == 'nt':
|
| 34 |
-
pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 35 |
-
|
| 36 |
-
# Test Tesseract
|
| 37 |
-
pytesseract.get_tesseract_version()
|
| 38 |
-
print("✅ Tesseract configured successfully")
|
| 39 |
-
except Exception as e:
|
| 40 |
-
print(f"⚠️ Tesseract configuration warning: {e}")
|
| 41 |
|
| 42 |
def _debug_print(self, label: str, data: any):
|
| 43 |
-
"""
|
| 44 |
if self.debug:
|
| 45 |
print(f"\n🔍 [PDF Parser] {label}")
|
| 46 |
if isinstance(data, dict):
|
|
@@ -54,7 +41,7 @@ class PDFParser:
|
|
| 54 |
print(f" {data}")
|
| 55 |
|
| 56 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 57 |
-
"""
|
| 58 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 59 |
try:
|
| 60 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
@@ -64,12 +51,12 @@ class PDFParser:
|
|
| 64 |
return {}
|
| 65 |
|
| 66 |
def _save_processed_files(self):
|
| 67 |
-
"""
|
| 68 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 69 |
json.dump(self.processed_files, f, indent=2)
|
| 70 |
|
| 71 |
def _get_file_hash(self, file_path: str) -> str:
|
| 72 |
-
"""
|
| 73 |
hash_md5 = hashlib.md5()
|
| 74 |
with open(file_path, "rb") as f:
|
| 75 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
@@ -77,7 +64,7 @@ class PDFParser:
|
|
| 77 |
return hash_md5.hexdigest()
|
| 78 |
|
| 79 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
| 80 |
-
"""
|
| 81 |
text = ""
|
| 82 |
try:
|
| 83 |
with open(pdf_path, 'rb') as file:
|
|
@@ -96,40 +83,36 @@ class PDFParser:
|
|
| 96 |
return text
|
| 97 |
|
| 98 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 99 |
-
"""
|
| 100 |
images_data = []
|
| 101 |
try:
|
| 102 |
-
self._debug_print("Image
|
| 103 |
|
| 104 |
images = convert_from_path(pdf_path, dpi=150)
|
| 105 |
-
self._debug_print(
|
| 106 |
|
| 107 |
for idx, image in enumerate(images):
|
| 108 |
-
self._debug_print(f"
|
| 109 |
|
| 110 |
-
# Save image
|
| 111 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 112 |
image.save(image_path)
|
| 113 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 114 |
|
| 115 |
-
|
| 116 |
-
self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
|
| 117 |
|
| 118 |
try:
|
| 119 |
-
# CRITICAL: Use 'rus+eng' for Russian + English support
|
| 120 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 121 |
|
| 122 |
-
# Clean up text
|
| 123 |
ocr_text = ocr_text.strip()
|
| 124 |
|
| 125 |
if not ocr_text or len(ocr_text) < 5:
|
| 126 |
-
self._debug_print(f"Image {idx} OCR Result", f"
|
| 127 |
else:
|
| 128 |
-
self._debug_print(f"Image {idx} OCR Result", f"
|
| 129 |
|
| 130 |
except Exception as ocr_error:
|
| 131 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 132 |
-
ocr_text = f"[Image {idx}: OCR failed
|
| 133 |
|
| 134 |
images_data.append({
|
| 135 |
'page': idx,
|
|
@@ -144,13 +127,13 @@ class PDFParser:
|
|
| 144 |
return images_data
|
| 145 |
|
| 146 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 147 |
-
"""
|
| 148 |
tables_data = []
|
| 149 |
try:
|
| 150 |
text = self._extract_text_from_pdf(pdf_path)
|
| 151 |
lines = text.split('\n')
|
| 152 |
|
| 153 |
-
self._debug_print("Table
|
| 154 |
|
| 155 |
current_table = []
|
| 156 |
for line in lines:
|
|
@@ -177,44 +160,39 @@ class PDFParser:
|
|
| 177 |
return tables_data
|
| 178 |
|
| 179 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 180 |
-
"""
|
| 181 |
file_hash = self._get_file_hash(pdf_path)
|
| 182 |
doc_id = Path(pdf_path).stem
|
| 183 |
|
| 184 |
-
self._debug_print("PDF Parsing Started", f"File: {doc_id}
|
| 185 |
|
| 186 |
-
# Check if file was already processed
|
| 187 |
if doc_id in self.processed_files:
|
| 188 |
if self.processed_files[doc_id] == file_hash:
|
| 189 |
-
self._debug_print("Status", f"File {doc_id} already processed
|
| 190 |
return self._load_extracted_data(doc_id)
|
| 191 |
|
| 192 |
-
print(f"\
|
| 193 |
|
| 194 |
-
# Extract content
|
| 195 |
text = self._extract_text_from_pdf(pdf_path)
|
| 196 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 197 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 198 |
|
| 199 |
-
|
| 200 |
-
self._debug_print("Extraction Summary", {
|
| 201 |
'text_length': len(text),
|
| 202 |
'images_count': len(images),
|
| 203 |
'tables_count': len(tables),
|
| 204 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 205 |
})
|
| 206 |
|
| 207 |
-
# Save extracted data
|
| 208 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 209 |
|
| 210 |
-
# Update processed files log
|
| 211 |
self.processed_files[doc_id] = file_hash
|
| 212 |
self._save_processed_files()
|
| 213 |
|
| 214 |
return text, images, tables
|
| 215 |
|
| 216 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
| 217 |
-
"""
|
| 218 |
data = {
|
| 219 |
'text': text,
|
| 220 |
'images': images,
|
|
@@ -227,7 +205,7 @@ class PDFParser:
|
|
| 227 |
self._debug_print("Data Saved", str(data_path))
|
| 228 |
|
| 229 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 230 |
-
"""
|
| 231 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 232 |
try:
|
| 233 |
with open(data_path, 'r', encoding='utf-8') as f:
|
|
@@ -237,7 +215,7 @@ class PDFParser:
|
|
| 237 |
return "", [], []
|
| 238 |
|
| 239 |
def get_all_documents(self) -> Dict:
|
| 240 |
-
"""
|
| 241 |
all_docs = {}
|
| 242 |
for json_file in self.docstore_path.glob("*_data.json"):
|
| 243 |
doc_id = json_file.stem.replace("_data", "")
|
|
|
|
| 1 |
"""
|
| 2 |
+
PDF Парсер
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
|
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
|
| 23 |
+
|
| 24 |
self._configure_tesseract()
|
| 25 |
|
| 26 |
if self.debug:
|
| 27 |
+
print("PDFParser initialized")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def _debug_print(self, label: str, data: any):
|
| 30 |
+
"""Debug"""
|
| 31 |
if self.debug:
|
| 32 |
print(f"\n🔍 [PDF Parser] {label}")
|
| 33 |
if isinstance(data, dict):
|
|
|
|
| 41 |
print(f" {data}")
|
| 42 |
|
| 43 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 44 |
+
"""Подгрузка обработанных файлов"""
|
| 45 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 46 |
try:
|
| 47 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
|
|
| 51 |
return {}
|
| 52 |
|
| 53 |
def _save_processed_files(self):
|
| 54 |
+
"""Сохранение обработанных файлов"""
|
| 55 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 56 |
json.dump(self.processed_files, f, indent=2)
|
| 57 |
|
| 58 |
def _get_file_hash(self, file_path: str) -> str:
|
| 59 |
+
"""Проверка изменения файлов"""
|
| 60 |
hash_md5 = hashlib.md5()
|
| 61 |
with open(file_path, "rb") as f:
|
| 62 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
|
|
| 64 |
return hash_md5.hexdigest()
|
| 65 |
|
| 66 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
| 67 |
+
"""Извлечение текста из PDF"""
|
| 68 |
text = ""
|
| 69 |
try:
|
| 70 |
with open(pdf_path, 'rb') as file:
|
|
|
|
| 83 |
return text
|
| 84 |
|
| 85 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 86 |
+
"""Извлечение изображений из PDF"""
|
| 87 |
images_data = []
|
| 88 |
try:
|
| 89 |
+
self._debug_print("Image extraction", f"File: {pdf_path}")
|
| 90 |
|
| 91 |
images = convert_from_path(pdf_path, dpi=150)
|
| 92 |
+
self._debug_print(f"Total images: {len(images)}")
|
| 93 |
|
| 94 |
for idx, image in enumerate(images):
|
| 95 |
+
self._debug_print(f"Image {idx}", f"Size: {image.size}")
|
| 96 |
|
|
|
|
| 97 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 98 |
image.save(image_path)
|
| 99 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 100 |
|
| 101 |
+
self._debug_print(f"Image {idx} OCR", "Running OCR...")
|
|
|
|
| 102 |
|
| 103 |
try:
|
|
|
|
| 104 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 105 |
|
|
|
|
| 106 |
ocr_text = ocr_text.strip()
|
| 107 |
|
| 108 |
if not ocr_text or len(ocr_text) < 5:
|
| 109 |
+
self._debug_print(f"Image {idx} OCR Result", f"WARN ({len(ocr_text)} chars)")
|
| 110 |
else:
|
| 111 |
+
self._debug_print(f"Image {idx} OCR Result", f"SUCCESS {len(ocr_text)} chars: {ocr_text[:150]}")
|
| 112 |
|
| 113 |
except Exception as ocr_error:
|
| 114 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 115 |
+
ocr_text = f"[Image {idx}: OCR failed {str(ocr_error)}]"
|
| 116 |
|
| 117 |
images_data.append({
|
| 118 |
'page': idx,
|
|
|
|
| 127 |
return images_data
|
| 128 |
|
| 129 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 130 |
+
"""Извлечение таблиц из PDF"""
|
| 131 |
tables_data = []
|
| 132 |
try:
|
| 133 |
text = self._extract_text_from_pdf(pdf_path)
|
| 134 |
lines = text.split('\n')
|
| 135 |
|
| 136 |
+
self._debug_print("Table extraction", f"Scanning {len(lines)} lines")
|
| 137 |
|
| 138 |
current_table = []
|
| 139 |
for line in lines:
|
|
|
|
| 160 |
return tables_data
|
| 161 |
|
| 162 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 163 |
+
"""Парсинг PDF"""
|
| 164 |
file_hash = self._get_file_hash(pdf_path)
|
| 165 |
doc_id = Path(pdf_path).stem
|
| 166 |
|
| 167 |
+
self._debug_print("PDF Parsing Started", f"File: {doc_id}")
|
| 168 |
|
|
|
|
| 169 |
if doc_id in self.processed_files:
|
| 170 |
if self.processed_files[doc_id] == file_hash:
|
| 171 |
+
self._debug_print("Status", f"File {doc_id} already processed")
|
| 172 |
return self._load_extracted_data(doc_id)
|
| 173 |
|
| 174 |
+
print(f"\nProcessing PDF: {doc_id}")
|
| 175 |
|
|
|
|
| 176 |
text = self._extract_text_from_pdf(pdf_path)
|
| 177 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 178 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 179 |
|
| 180 |
+
self._debug_print("Summary", {
|
|
|
|
| 181 |
'text_length': len(text),
|
| 182 |
'images_count': len(images),
|
| 183 |
'tables_count': len(tables),
|
| 184 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 185 |
})
|
| 186 |
|
|
|
|
| 187 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 188 |
|
|
|
|
| 189 |
self.processed_files[doc_id] = file_hash
|
| 190 |
self._save_processed_files()
|
| 191 |
|
| 192 |
return text, images, tables
|
| 193 |
|
| 194 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
| 195 |
+
"""Сохранение извелеченных данных в Docstore"""
|
| 196 |
data = {
|
| 197 |
'text': text,
|
| 198 |
'images': images,
|
|
|
|
| 205 |
self._debug_print("Data Saved", str(data_path))
|
| 206 |
|
| 207 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 208 |
+
"""Подгрузка ранее извлеченных данных из Docstore"""
|
| 209 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 210 |
try:
|
| 211 |
with open(data_path, 'r', encoding='utf-8') as f:
|
|
|
|
| 215 |
return "", [], []
|
| 216 |
|
| 217 |
def get_all_documents(self) -> Dict:
|
| 218 |
+
"""Получение всех документов из Docstore"""
|
| 219 |
all_docs = {}
|
| 220 |
for json_file in self.docstore_path.glob("*_data.json"):
|
| 221 |
doc_id = json_file.stem.replace("_data", "")
|
src/rag_system.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
Sends base64 images directly to GPT-4o for visual analysis (not just OCR)
|
| 4 |
-
Then stores results in vector store
|
| 5 |
"""
|
| 6 |
from typing import List, Dict
|
| 7 |
from langchain_openai import ChatOpenAI
|
|
@@ -17,20 +15,18 @@ from config import (
|
|
| 17 |
|
| 18 |
class VisualMultimodalRAG:
|
| 19 |
"""
|
| 20 |
-
RAG
|
| 21 |
-
1.
|
| 22 |
-
2.
|
| 23 |
-
3.
|
| 24 |
-
4. Enables image-based semantic search
|
| 25 |
"""
|
| 26 |
|
| 27 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 28 |
api_key = api_key or OPENAI_API_KEY
|
| 29 |
self.debug = debug
|
| 30 |
|
| 31 |
-
# Use gpt-4o for vision capabilities
|
| 32 |
self.llm = ChatOpenAI(
|
| 33 |
-
model_name=
|
| 34 |
api_key=api_key,
|
| 35 |
temperature=TEMPERATURE,
|
| 36 |
max_tokens=MAX_TOKENS,
|
|
@@ -40,12 +36,12 @@ class VisualMultimodalRAG:
|
|
| 40 |
self.visual_summaries_log = []
|
| 41 |
|
| 42 |
if self.debug:
|
| 43 |
-
print("
|
| 44 |
|
| 45 |
def _debug_print(self, label: str, data: any):
|
| 46 |
-
"""
|
| 47 |
if self.debug:
|
| 48 |
-
print(f"\
|
| 49 |
if isinstance(data, (list, dict)):
|
| 50 |
print(f" Type: {type(data).__name__}")
|
| 51 |
print(f" Content: {str(data)[:300]}...")
|
|
@@ -53,7 +49,7 @@ class VisualMultimodalRAG:
|
|
| 53 |
print(f" {data}")
|
| 54 |
|
| 55 |
def _image_to_base64(self, image_path: str) -> str:
|
| 56 |
-
"""
|
| 57 |
try:
|
| 58 |
with open(image_path, 'rb') as image_file:
|
| 59 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
@@ -64,27 +60,16 @@ class VisualMultimodalRAG:
|
|
| 64 |
|
| 65 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
| 66 |
"""
|
| 67 |
-
|
| 68 |
-
Returns detailed visual analysis/description
|
| 69 |
-
|
| 70 |
-
gpt-4o can see:
|
| 71 |
-
- Charts, graphs, diagrams
|
| 72 |
-
- Tables and structured data
|
| 73 |
-
- Photos and drawings
|
| 74 |
-
- Handwritten text
|
| 75 |
-
- Screenshots
|
| 76 |
-
- Any visual content
|
| 77 |
"""
|
| 78 |
if not os.path.exists(image_path):
|
| 79 |
return f"[Image {image_idx}: File not found - {image_path}]"
|
| 80 |
|
| 81 |
try:
|
| 82 |
-
# Convert image to base64
|
| 83 |
image_base64 = self._image_to_base64(image_path)
|
| 84 |
if not image_base64:
|
| 85 |
-
return f"[Image {image_idx}:
|
| 86 |
|
| 87 |
-
# Determine image type
|
| 88 |
file_ext = Path(image_path).suffix.lower()
|
| 89 |
media_type_map = {
|
| 90 |
'.jpg': 'image/jpeg',
|
|
@@ -95,9 +80,8 @@ class VisualMultimodalRAG:
|
|
| 95 |
}
|
| 96 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 97 |
|
| 98 |
-
print(f"
|
| 99 |
|
| 100 |
-
# Create message with image
|
| 101 |
message = HumanMessage(
|
| 102 |
content=[
|
| 103 |
{
|
|
@@ -108,41 +92,38 @@ class VisualMultimodalRAG:
|
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"type": "text",
|
| 111 |
-
"text": f"""
|
| 112 |
|
| 113 |
-
|
| 114 |
-
1.
|
| 115 |
-
2.
|
| 116 |
-
3.
|
| 117 |
-
4.
|
| 118 |
-
5. **Connections** - How this relates to document content
|
| 119 |
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
}
|
| 124 |
],
|
| 125 |
)
|
| 126 |
|
| 127 |
-
# Call gpt-4o with vision
|
| 128 |
response = self.llm.invoke([message])
|
| 129 |
analysis = response.content.strip()
|
| 130 |
|
| 131 |
if self.debug:
|
| 132 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 133 |
|
| 134 |
-
print(f"
|
| 135 |
return analysis
|
| 136 |
|
| 137 |
except Exception as e:
|
| 138 |
error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
|
| 139 |
-
print(f"
|
| 140 |
return error_msg
|
| 141 |
|
| 142 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 143 |
"""
|
| 144 |
-
|
| 145 |
-
Returns list of {image_index, visual_analysis, type}
|
| 146 |
"""
|
| 147 |
visual_analyses = []
|
| 148 |
|
|
@@ -150,10 +131,9 @@ Analysis:"""
|
|
| 150 |
image_path = image.get('path', '')
|
| 151 |
|
| 152 |
if not image_path:
|
| 153 |
-
print(f"
|
| 154 |
continue
|
| 155 |
|
| 156 |
-
# Analyze image visually (not just OCR)
|
| 157 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 158 |
|
| 159 |
visual_analyses.append({
|
|
@@ -161,14 +141,14 @@ Analysis:"""
|
|
| 161 |
'image_index': idx,
|
| 162 |
'image_path': image_path,
|
| 163 |
'visual_analysis': visual_analysis,
|
| 164 |
-
'ocr_text': image.get('ocr_text', '')
|
| 165 |
})
|
| 166 |
|
| 167 |
return visual_analyses
|
| 168 |
|
| 169 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
| 170 |
"""
|
| 171 |
-
|
| 172 |
"""
|
| 173 |
chunks = []
|
| 174 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
|
@@ -180,13 +160,13 @@ Analysis:"""
|
|
| 180 |
continue
|
| 181 |
|
| 182 |
try:
|
| 183 |
-
prompt = f"""
|
| 184 |
-
|
| 185 |
|
| 186 |
-
|
| 187 |
{chunk}
|
| 188 |
|
| 189 |
-
|
| 190 |
|
| 191 |
message = HumanMessage(content=prompt)
|
| 192 |
response = self.llm.invoke([message])
|
|
@@ -210,7 +190,7 @@ Summary (2-3 sentences maximum):"""
|
|
| 210 |
|
| 211 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
| 212 |
"""
|
| 213 |
-
|
| 214 |
"""
|
| 215 |
summaries = []
|
| 216 |
|
|
@@ -221,13 +201,13 @@ Summary (2-3 sentences maximum):"""
|
|
| 221 |
continue
|
| 222 |
|
| 223 |
try:
|
| 224 |
-
prompt = f"""
|
| 225 |
-
|
| 226 |
|
| 227 |
-
|
| 228 |
{table_content}
|
| 229 |
|
| 230 |
-
|
| 231 |
|
| 232 |
message = HumanMessage(content=prompt)
|
| 233 |
response = self.llm.invoke([message])
|
|
@@ -258,12 +238,10 @@ Summary (2-3 sentences maximum):"""
|
|
| 258 |
doc_id: str
|
| 259 |
) -> Dict:
|
| 260 |
"""
|
| 261 |
-
|
| 262 |
-
Images are analyzed using gpt-4o vision (not just OCR)
|
| 263 |
"""
|
| 264 |
-
|
| 265 |
-
print(f"PROCESSING
|
| 266 |
-
print(f"{'='*70}")
|
| 267 |
|
| 268 |
results = {
|
| 269 |
'doc_id': doc_id,
|
|
@@ -273,14 +251,12 @@ Summary (2-3 sentences maximum):"""
|
|
| 273 |
'total_stored': 0
|
| 274 |
}
|
| 275 |
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
print(f"{'─'*70}")
|
| 279 |
|
| 280 |
image_analyses = self.analyze_images_visually(images)
|
| 281 |
results['image_visual_analyses'] = image_analyses
|
| 282 |
|
| 283 |
-
# Store each image analysis in vector store
|
| 284 |
image_docs = {
|
| 285 |
'text': ' | '.join([
|
| 286 |
f"Image {a['image_index']}: {a['visual_analysis']}"
|
|
@@ -291,7 +267,7 @@ Summary (2-3 sentences maximum):"""
|
|
| 291 |
}
|
| 292 |
|
| 293 |
for analysis in image_analyses:
|
| 294 |
-
print(f"
|
| 295 |
print(f" Path: {analysis['image_path']}")
|
| 296 |
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
| 297 |
|
|
@@ -302,13 +278,11 @@ Summary (2-3 sentences maximum):"""
|
|
| 302 |
f"{doc_id}_images_visual"
|
| 303 |
)
|
| 304 |
results['total_stored'] += len(image_analyses)
|
| 305 |
-
print(f"
|
| 306 |
except Exception as e:
|
| 307 |
-
print(f"
|
| 308 |
|
| 309 |
-
|
| 310 |
-
print(f"\n📝 TEXT CHUNK SUMMARIZATION")
|
| 311 |
-
print(f"{'─'*70}")
|
| 312 |
|
| 313 |
text_summaries = self.summarize_text_chunks(text)
|
| 314 |
results['text_summaries'] = text_summaries
|
|
@@ -321,7 +295,7 @@ Summary (2-3 sentences maximum):"""
|
|
| 321 |
}
|
| 322 |
|
| 323 |
for summary in text_summaries:
|
| 324 |
-
print(f"
|
| 325 |
|
| 326 |
if text_summaries:
|
| 327 |
try:
|
|
@@ -330,13 +304,11 @@ Summary (2-3 sentences maximum):"""
|
|
| 330 |
f"{doc_id}_text_chunks"
|
| 331 |
)
|
| 332 |
results['total_stored'] += len(text_summaries)
|
| 333 |
-
print(f"
|
| 334 |
except Exception as e:
|
| 335 |
-
print(f"
|
| 336 |
|
| 337 |
-
|
| 338 |
-
print(f"\n📋 TABLE SUMMARIZATION ({len(tables)} total)")
|
| 339 |
-
print(f"{'─'*70}")
|
| 340 |
|
| 341 |
table_summaries = self.summarize_tables(tables)
|
| 342 |
results['table_summaries'] = table_summaries
|
|
@@ -349,7 +321,7 @@ Summary (2-3 sentences maximum):"""
|
|
| 349 |
}
|
| 350 |
|
| 351 |
for summary in table_summaries:
|
| 352 |
-
print(f"
|
| 353 |
|
| 354 |
if table_summaries:
|
| 355 |
try:
|
|
@@ -358,25 +330,20 @@ Summary (2-3 sentences maximum):"""
|
|
| 358 |
f"{doc_id}_tables"
|
| 359 |
)
|
| 360 |
results['total_stored'] += len(table_summaries)
|
| 361 |
-
print(f"
|
| 362 |
except Exception as e:
|
| 363 |
-
print(f"
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
print(f"
|
| 367 |
-
print(f"
|
| 368 |
-
print(f"{
|
| 369 |
-
print(f" Images analyzed visually & stored: {len(image_analyses)}")
|
| 370 |
-
print(f" Text chunks summarized & stored: {len(text_summaries)}")
|
| 371 |
-
print(f" Tables summarized & stored: {len(table_summaries)}")
|
| 372 |
print(f" Total items stored in vector: {results['total_stored']}")
|
| 373 |
-
print(f"{'='*70}")
|
| 374 |
|
| 375 |
self.visual_summaries_log.append(results)
|
| 376 |
return results
|
| 377 |
|
| 378 |
def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
|
| 379 |
-
"""Split text into overlapping chunks"""
|
| 380 |
chunks = []
|
| 381 |
start = 0
|
| 382 |
while start < len(text):
|
|
@@ -386,16 +353,15 @@ Summary (2-3 sentences maximum):"""
|
|
| 386 |
return chunks
|
| 387 |
|
| 388 |
def get_visual_summaries_log(self) -> List[Dict]:
|
| 389 |
-
"""Get all visual analysis logs"""
|
| 390 |
return self.visual_summaries_log
|
| 391 |
|
| 392 |
|
| 393 |
class AnsweringRAG:
|
| 394 |
"""
|
| 395 |
-
RAG
|
| 396 |
-
1.
|
| 397 |
-
2.
|
| 398 |
-
3.
|
| 399 |
"""
|
| 400 |
|
| 401 |
def __init__(self, api_key: str = None, debug: bool = True):
|
|
@@ -403,7 +369,7 @@ class AnsweringRAG:
|
|
| 403 |
self.debug = debug
|
| 404 |
|
| 405 |
self.llm = ChatOpenAI(
|
| 406 |
-
model_name=
|
| 407 |
api_key=api_key,
|
| 408 |
temperature=TEMPERATURE,
|
| 409 |
max_tokens=MAX_TOKENS,
|
|
@@ -413,10 +379,10 @@ class AnsweringRAG:
|
|
| 413 |
self.answer_log = []
|
| 414 |
|
| 415 |
if self.debug:
|
| 416 |
-
print("
|
| 417 |
|
| 418 |
def _debug_print(self, label: str, data: any):
|
| 419 |
-
"""
|
| 420 |
if self.debug:
|
| 421 |
print(f"\n🔍 DEBUG [{label}]:")
|
| 422 |
if isinstance(data, (list, dict)):
|
|
@@ -431,9 +397,9 @@ class AnsweringRAG:
|
|
| 431 |
search_results: List[Dict]
|
| 432 |
) -> Dict:
|
| 433 |
"""
|
| 434 |
-
|
| 435 |
|
| 436 |
-
|
| 437 |
{
|
| 438 |
'question': user question,
|
| 439 |
'answer': detailed answer,
|
|
@@ -443,22 +409,15 @@ class AnsweringRAG:
|
|
| 443 |
}
|
| 444 |
"""
|
| 445 |
|
| 446 |
-
print(f"\n{'='*70}")
|
| 447 |
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
| 448 |
-
print(f"{'='*70}")
|
| 449 |
|
| 450 |
-
print(f"\n
|
| 451 |
-
print(f"
|
| 452 |
|
| 453 |
-
# Check if we have search results
|
| 454 |
if not search_results:
|
| 455 |
-
print(f"
|
| 456 |
-
answer = f"""
|
| 457 |
-
|
| 458 |
-
Try:
|
| 459 |
-
- Using different keywords
|
| 460 |
-
- Breaking the question into smaller parts
|
| 461 |
-
- Asking about other topics in the document"""
|
| 462 |
|
| 463 |
result = {
|
| 464 |
'question': question,
|
|
@@ -470,7 +429,6 @@ Try:
|
|
| 470 |
self.answer_log.append(result)
|
| 471 |
return result
|
| 472 |
|
| 473 |
-
# Build context from search results
|
| 474 |
context_parts = []
|
| 475 |
for idx, result in enumerate(search_results, 1):
|
| 476 |
content = result.get('content', '')
|
|
@@ -485,43 +443,39 @@ Try:
|
|
| 485 |
|
| 486 |
full_context = "\n".join(context_parts)
|
| 487 |
|
| 488 |
-
self._debug_print("Context Prepared", f"{len(context_parts)} sources
|
| 489 |
|
| 490 |
-
|
| 491 |
-
analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
|
| 492 |
|
| 493 |
-
|
| 494 |
"{question}"
|
| 495 |
|
| 496 |
-
|
| 497 |
{full_context}
|
| 498 |
|
| 499 |
-
|
| 500 |
-
1.
|
| 501 |
-
2.
|
| 502 |
-
3.
|
| 503 |
-
4.
|
| 504 |
-
5.
|
| 505 |
-
6. Structure your answer clearly with key points
|
| 506 |
|
| 507 |
-
|
| 508 |
|
| 509 |
-
print(f"\n
|
| 510 |
-
print(f" Context size: {len(full_context)}
|
| 511 |
print(f" Sources: {len(search_results)}")
|
| 512 |
|
| 513 |
try:
|
| 514 |
-
# Call LLM to analyze and answer
|
| 515 |
message = HumanMessage(content=analysis_prompt)
|
| 516 |
response = self.llm.invoke([message])
|
| 517 |
answer = response.content.strip()
|
| 518 |
|
| 519 |
-
# Determine confidence level
|
| 520 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 521 |
|
| 522 |
-
print(f"
|
| 523 |
print(f" Confidence: {confidence}")
|
| 524 |
-
print(f" Answer length: {len(answer)}
|
| 525 |
|
| 526 |
result = {
|
| 527 |
'question': question,
|
|
@@ -535,8 +489,8 @@ ANSWER:"""
|
|
| 535 |
return result
|
| 536 |
|
| 537 |
except Exception as e:
|
| 538 |
-
print(f"
|
| 539 |
-
answer = f"
|
| 540 |
|
| 541 |
result = {
|
| 542 |
'question': question,
|
|
@@ -551,66 +505,14 @@ ANSWER:"""
|
|
| 551 |
return result
|
| 552 |
|
| 553 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
| 554 |
-
"""
|
| 555 |
answer_length = len(answer)
|
| 556 |
|
| 557 |
-
# High confidence: multiple sources, substantial answer
|
| 558 |
if sources_count >= 3 and answer_length > 500:
|
| 559 |
return "high"
|
| 560 |
|
| 561 |
-
# Medium confidence: some sources, decent answer
|
| 562 |
elif sources_count >= 2 and answer_length > 200:
|
| 563 |
return "medium"
|
| 564 |
|
| 565 |
-
# Low confidence: few sources or short answer
|
| 566 |
else:
|
| 567 |
return "low"
|
| 568 |
-
|
| 569 |
-
def get_answer_with_sources(
|
| 570 |
-
self,
|
| 571 |
-
question: str,
|
| 572 |
-
search_results: List[Dict]
|
| 573 |
-
) -> Dict:
|
| 574 |
-
"""
|
| 575 |
-
Get answer AND properly formatted sources
|
| 576 |
-
Returns both answer and formatted source citations
|
| 577 |
-
"""
|
| 578 |
-
|
| 579 |
-
result = self.analyze_and_answer(question, search_results)
|
| 580 |
-
|
| 581 |
-
# Format sources for display
|
| 582 |
-
formatted_sources = []
|
| 583 |
-
for idx, source in enumerate(result['search_results'], 1):
|
| 584 |
-
formatted_sources.append({
|
| 585 |
-
'index': idx,
|
| 586 |
-
'type': source.get('type', 'unknown'),
|
| 587 |
-
'content': source.get('content', ''),
|
| 588 |
-
'relevance': 1 - source.get('distance', 0) if source.get('distance') else 0
|
| 589 |
-
})
|
| 590 |
-
|
| 591 |
-
result['formatted_sources'] = formatted_sources
|
| 592 |
-
return result
|
| 593 |
-
|
| 594 |
-
def get_answer_log(self) -> List[Dict]:
|
| 595 |
-
"""Get all answer generation logs"""
|
| 596 |
-
return self.answer_log
|
| 597 |
-
|
| 598 |
-
def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
|
| 599 |
-
"""Pretty print answer with sources"""
|
| 600 |
-
|
| 601 |
-
print(f"\n{'='*70}")
|
| 602 |
-
print(f"ANSWER TO: {result['question']}")
|
| 603 |
-
print(f"{'='*70}")
|
| 604 |
-
|
| 605 |
-
print(f"\n📝 ANSWER (Confidence: {result['confidence'].upper()}):")
|
| 606 |
-
print(f"{'-'*70}")
|
| 607 |
-
print(result['answer'])
|
| 608 |
-
print(f"{'-'*70}")
|
| 609 |
-
|
| 610 |
-
if result.get('formatted_sources'):
|
| 611 |
-
print(f"\n📚 SOURCES USED ({len(result['formatted_sources'])} total):")
|
| 612 |
-
for source in result['formatted_sources']:
|
| 613 |
-
print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
|
| 614 |
-
print(f"{source['content'][:max_source_length]}...")
|
| 615 |
-
|
| 616 |
-
print(f"\n{'='*70}")
|
|
|
|
| 1 |
"""
|
| 2 |
+
RAG основной pipeline
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
from typing import List, Dict
|
| 5 |
from langchain_openai import ChatOpenAI
|
|
|
|
| 15 |
|
| 16 |
class VisualMultimodalRAG:
|
| 17 |
"""
|
| 18 |
+
RAG - подготовительный этап:
|
| 19 |
+
1. Кодирует изображение в base64 и отправляет в gpt-4o-mini
|
| 20 |
+
2. Получает описание изображения
|
| 21 |
+
3. Сохраняет описание в векторное хранилище
|
|
|
|
| 22 |
"""
|
| 23 |
|
| 24 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 25 |
api_key = api_key or OPENAI_API_KEY
|
| 26 |
self.debug = debug
|
| 27 |
|
|
|
|
| 28 |
self.llm = ChatOpenAI(
|
| 29 |
+
model_name=OPENAI_MODEL,
|
| 30 |
api_key=api_key,
|
| 31 |
temperature=TEMPERATURE,
|
| 32 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 36 |
self.visual_summaries_log = []
|
| 37 |
|
| 38 |
if self.debug:
|
| 39 |
+
print(f"VisualMultimodalRAG with {OPENAI_MODEL}")
|
| 40 |
|
| 41 |
def _debug_print(self, label: str, data: any):
|
| 42 |
+
"""Debug"""
|
| 43 |
if self.debug:
|
| 44 |
+
print(f"\nDEBUG [{label}]:")
|
| 45 |
if isinstance(data, (list, dict)):
|
| 46 |
print(f" Type: {type(data).__name__}")
|
| 47 |
print(f" Content: {str(data)[:300]}...")
|
|
|
|
| 49 |
print(f" {data}")
|
| 50 |
|
| 51 |
def _image_to_base64(self, image_path: str) -> str:
|
| 52 |
+
"""Конвертирует изображение в base64"""
|
| 53 |
try:
|
| 54 |
with open(image_path, 'rb') as image_file:
|
| 55 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
|
|
| 60 |
|
| 61 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
| 62 |
"""
|
| 63 |
+
Отправляет в модель изображение для суммаризации
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
"""
|
| 65 |
if not os.path.exists(image_path):
|
| 66 |
return f"[Image {image_idx}: File not found - {image_path}]"
|
| 67 |
|
| 68 |
try:
|
|
|
|
| 69 |
image_base64 = self._image_to_base64(image_path)
|
| 70 |
if not image_base64:
|
| 71 |
+
return f"[Image {image_idx}: Error converting to base64]"
|
| 72 |
|
|
|
|
| 73 |
file_ext = Path(image_path).suffix.lower()
|
| 74 |
media_type_map = {
|
| 75 |
'.jpg': 'image/jpeg',
|
|
|
|
| 80 |
}
|
| 81 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 82 |
|
| 83 |
+
print(f" Analyzing image {image_idx}...")
|
| 84 |
|
|
|
|
| 85 |
message = HumanMessage(
|
| 86 |
content=[
|
| 87 |
{
|
|
|
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"type": "text",
|
| 95 |
+
"text": f"""Ты - ассистент по сбору и обобщению информации. Проанализируй изображение.
|
| 96 |
|
| 97 |
+
По результатам анализа предоставь информацию:
|
| 98 |
+
1. Что изображено на картинке - основные объекты и элементы
|
| 99 |
+
2. Тип данных и содержимое - числа, графики, зависимости.
|
| 100 |
+
3. Назначение изображения - для чего оно представлено и что отображает
|
| 101 |
+
4. Связь с текстом
|
|
|
|
| 102 |
|
| 103 |
+
Будь краток и содержателен. Фокусируйся на визуальной информации.
|
| 104 |
|
| 105 |
+
Результат:"""
|
| 106 |
}
|
| 107 |
],
|
| 108 |
)
|
| 109 |
|
|
|
|
| 110 |
response = self.llm.invoke([message])
|
| 111 |
analysis = response.content.strip()
|
| 112 |
|
| 113 |
if self.debug:
|
| 114 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 115 |
|
| 116 |
+
print(f" Image {image_idx} analyzed successfully")
|
| 117 |
return analysis
|
| 118 |
|
| 119 |
except Exception as e:
|
| 120 |
error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
|
| 121 |
+
print(f" Error analyzing image {image_idx}: {e}")
|
| 122 |
return error_msg
|
| 123 |
|
| 124 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 125 |
"""
|
| 126 |
+
Считывает изображения и отправляет на анализ
|
|
|
|
| 127 |
"""
|
| 128 |
visual_analyses = []
|
| 129 |
|
|
|
|
| 131 |
image_path = image.get('path', '')
|
| 132 |
|
| 133 |
if not image_path:
|
| 134 |
+
print(f" Image {idx}: No path")
|
| 135 |
continue
|
| 136 |
|
|
|
|
| 137 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 138 |
|
| 139 |
visual_analyses.append({
|
|
|
|
| 141 |
'image_index': idx,
|
| 142 |
'image_path': image_path,
|
| 143 |
'visual_analysis': visual_analysis,
|
| 144 |
+
'ocr_text': image.get('ocr_text', '')
|
| 145 |
})
|
| 146 |
|
| 147 |
return visual_analyses
|
| 148 |
|
| 149 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
| 150 |
"""
|
| 151 |
+
Отправляет куски текста на суммаризацию
|
| 152 |
"""
|
| 153 |
chunks = []
|
| 154 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
|
|
|
| 160 |
continue
|
| 161 |
|
| 162 |
try:
|
| 163 |
+
prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующий кусок текста.
|
| 164 |
+
Выдели основные моменты, факты и идеи. Будь краток.
|
| 165 |
|
| 166 |
+
Текст :
|
| 167 |
{chunk}
|
| 168 |
|
| 169 |
+
Результат:"""
|
| 170 |
|
| 171 |
message = HumanMessage(content=prompt)
|
| 172 |
response = self.llm.invoke([message])
|
|
|
|
| 190 |
|
| 191 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
| 192 |
"""
|
| 193 |
+
Отправляет таблицы на суммаризацию
|
| 194 |
"""
|
| 195 |
summaries = []
|
| 196 |
|
|
|
|
| 201 |
continue
|
| 202 |
|
| 203 |
try:
|
| 204 |
+
prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующию таблицу.
|
| 205 |
+
Выдели основные моменты, числа, и значения строк/колонок. Будь краток.
|
| 206 |
|
| 207 |
+
Таблица:
|
| 208 |
{table_content}
|
| 209 |
|
| 210 |
+
Результат:"""
|
| 211 |
|
| 212 |
message = HumanMessage(content=prompt)
|
| 213 |
response = self.llm.invoke([message])
|
|
|
|
| 238 |
doc_id: str
|
| 239 |
) -> Dict:
|
| 240 |
"""
|
| 241 |
+
Основной pipeline анализирует и сохраняет документы в хранилище
|
|
|
|
| 242 |
"""
|
| 243 |
+
|
| 244 |
+
print(f"PROCESSING ANALYSIS: {doc_id}")
|
|
|
|
| 245 |
|
| 246 |
results = {
|
| 247 |
'doc_id': doc_id,
|
|
|
|
| 251 |
'total_stored': 0
|
| 252 |
}
|
| 253 |
|
| 254 |
+
print(f"\n VISUAL IMAGE ANALYSIS ({len(images)} )")
|
| 255 |
+
|
|
|
|
| 256 |
|
| 257 |
image_analyses = self.analyze_images_visually(images)
|
| 258 |
results['image_visual_analyses'] = image_analyses
|
| 259 |
|
|
|
|
| 260 |
image_docs = {
|
| 261 |
'text': ' | '.join([
|
| 262 |
f"Image {a['image_index']}: {a['visual_analysis']}"
|
|
|
|
| 267 |
}
|
| 268 |
|
| 269 |
for analysis in image_analyses:
|
| 270 |
+
print(f" Image {analysis['image_index']}")
|
| 271 |
print(f" Path: {analysis['image_path']}")
|
| 272 |
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
| 273 |
|
|
|
|
| 278 |
f"{doc_id}_images_visual"
|
| 279 |
)
|
| 280 |
results['total_stored'] += len(image_analyses)
|
| 281 |
+
print(f" Stored {len(image_analyses)} imagу analyses")
|
| 282 |
except Exception as e:
|
| 283 |
+
print(f"Error storing image analyses: {e}")
|
| 284 |
|
| 285 |
+
print(f"\n TEXT CHUNK SUMMARIZATION")
|
|
|
|
|
|
|
| 286 |
|
| 287 |
text_summaries = self.summarize_text_chunks(text)
|
| 288 |
results['text_summaries'] = text_summaries
|
|
|
|
| 295 |
}
|
| 296 |
|
| 297 |
for summary in text_summaries:
|
| 298 |
+
print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
|
| 299 |
|
| 300 |
if text_summaries:
|
| 301 |
try:
|
|
|
|
| 304 |
f"{doc_id}_text_chunks"
|
| 305 |
)
|
| 306 |
results['total_stored'] += len(text_summaries)
|
| 307 |
+
print(f" Stored {len(text_summaries)} text chunk summaries")
|
| 308 |
except Exception as e:
|
| 309 |
+
print(f" Error text summaries: {e}")
|
| 310 |
|
| 311 |
+
print(f"\n TABLE SUMMARIZATION ({len(tables)}")
|
|
|
|
|
|
|
| 312 |
|
| 313 |
table_summaries = self.summarize_tables(tables)
|
| 314 |
results['table_summaries'] = table_summaries
|
|
|
|
| 321 |
}
|
| 322 |
|
| 323 |
for summary in table_summaries:
|
| 324 |
+
print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
|
| 325 |
|
| 326 |
if table_summaries:
|
| 327 |
try:
|
|
|
|
| 330 |
f"{doc_id}_tables"
|
| 331 |
)
|
| 332 |
results['total_stored'] += len(table_summaries)
|
| 333 |
+
print(f" Stored {len(table_summaries)} table summaries")
|
| 334 |
except Exception as e:
|
| 335 |
+
print(f" Error storing table summaries: {e}")
|
| 336 |
+
|
| 337 |
+
print(f" STORAGE SUMMARY")
|
| 338 |
+
print(f" Images analyzed: {len(image_analyses)}")
|
| 339 |
+
print(f" Text chunks summarized: {len(text_summaries)}")
|
| 340 |
+
print(f" Tables summarized: {len(table_summaries)}")
|
|
|
|
|
|
|
|
|
|
| 341 |
print(f" Total items stored in vector: {results['total_stored']}")
|
|
|
|
| 342 |
|
| 343 |
self.visual_summaries_log.append(results)
|
| 344 |
return results
|
| 345 |
|
| 346 |
def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
|
|
|
|
| 347 |
chunks = []
|
| 348 |
start = 0
|
| 349 |
while start < len(text):
|
|
|
|
| 353 |
return chunks
|
| 354 |
|
| 355 |
def get_visual_summaries_log(self) -> List[Dict]:
|
|
|
|
| 356 |
return self.visual_summaries_log
|
| 357 |
|
| 358 |
|
| 359 |
class AnsweringRAG:
|
| 360 |
"""
|
| 361 |
+
RAG - работа с ответом на запрос:
|
| 362 |
+
1. Поиск в векторном хранилище
|
| 363 |
+
2. Анализ результатов
|
| 364 |
+
3. Предоставление ответа
|
| 365 |
"""
|
| 366 |
|
| 367 |
def __init__(self, api_key: str = None, debug: bool = True):
|
|
|
|
| 369 |
self.debug = debug
|
| 370 |
|
| 371 |
self.llm = ChatOpenAI(
|
| 372 |
+
model_name=OPENAI_MODEL,
|
| 373 |
api_key=api_key,
|
| 374 |
temperature=TEMPERATURE,
|
| 375 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 379 |
self.answer_log = []
|
| 380 |
|
| 381 |
if self.debug:
|
| 382 |
+
print(" AnsweringRAG initialized")
|
| 383 |
|
| 384 |
def _debug_print(self, label: str, data: any):
|
| 385 |
+
"""Debug"""
|
| 386 |
if self.debug:
|
| 387 |
print(f"\n🔍 DEBUG [{label}]:")
|
| 388 |
if isinstance(data, (list, dict)):
|
|
|
|
| 397 |
search_results: List[Dict]
|
| 398 |
) -> Dict:
|
| 399 |
"""
|
| 400 |
+
Проанализируй найденные документы и на их основе предоставь ответ на вопрос пользователя
|
| 401 |
|
| 402 |
+
Ответ:
|
| 403 |
{
|
| 404 |
'question': user question,
|
| 405 |
'answer': detailed answer,
|
|
|
|
| 409 |
}
|
| 410 |
"""
|
| 411 |
|
|
|
|
| 412 |
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
|
|
|
| 413 |
|
| 414 |
+
print(f"\n Question: {question}")
|
| 415 |
+
print(f" Search Results: {len(search_results)}")
|
| 416 |
|
|
|
|
| 417 |
if not search_results:
|
| 418 |
+
print(f" No search results found!")
|
| 419 |
+
answer = f"""Релевантная информация в документах отсутствует: "{question}"
|
| 420 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
result = {
|
| 423 |
'question': question,
|
|
|
|
| 429 |
self.answer_log.append(result)
|
| 430 |
return result
|
| 431 |
|
|
|
|
| 432 |
context_parts = []
|
| 433 |
for idx, result in enumerate(search_results, 1):
|
| 434 |
content = result.get('content', '')
|
|
|
|
| 443 |
|
| 444 |
full_context = "\n".join(context_parts)
|
| 445 |
|
| 446 |
+
self._debug_print("Context Prepared", f"{len(context_parts)} sources")
|
| 447 |
|
| 448 |
+
analysis_prompt = f"""Ты - ассистент по анализу документов и ответов на вопросы по ним.
|
|
|
|
| 449 |
|
| 450 |
+
ВОПРОС:
|
| 451 |
"{question}"
|
| 452 |
|
| 453 |
+
РЕЛЕВАНТНАЯ ИНФОРМАЦИЯ:
|
| 454 |
{full_context}
|
| 455 |
|
| 456 |
+
ИНСТРУКЦИИ:
|
| 457 |
+
1. Проанализируй предоставленный контент
|
| 458 |
+
2. Выдели информацию имеющую отношение к вопросу
|
| 459 |
+
3. Предоставь понятный и исчерпывающий ответ
|
| 460 |
+
4. Если контент полностью не отвечает на вопрос, предоставь информацию, которая доступна в контенте
|
| 461 |
+
5. Построй свой ответ опираясь на ключевые моменты
|
|
|
|
| 462 |
|
| 463 |
+
Ответ:"""
|
| 464 |
|
| 465 |
+
print(f"\n Analyzing search results...")
|
| 466 |
+
print(f" Context size: {len(full_context)} chars")
|
| 467 |
print(f" Sources: {len(search_results)}")
|
| 468 |
|
| 469 |
try:
|
|
|
|
| 470 |
message = HumanMessage(content=analysis_prompt)
|
| 471 |
response = self.llm.invoke([message])
|
| 472 |
answer = response.content.strip()
|
| 473 |
|
|
|
|
| 474 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 475 |
|
| 476 |
+
print(f" Answer generated successfully")
|
| 477 |
print(f" Confidence: {confidence}")
|
| 478 |
+
print(f" Answer length: {len(answer)} chars")
|
| 479 |
|
| 480 |
result = {
|
| 481 |
'question': question,
|
|
|
|
| 489 |
return result
|
| 490 |
|
| 491 |
except Exception as e:
|
| 492 |
+
print(f" Error generating answer: {e}")
|
| 493 |
+
answer = f"Error while analyzing the search results."
|
| 494 |
|
| 495 |
result = {
|
| 496 |
'question': question,
|
|
|
|
| 505 |
return result
|
| 506 |
|
| 507 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
| 508 |
+
"""Уверенность в ответе на основании найденных источников информации"""
|
| 509 |
answer_length = len(answer)
|
| 510 |
|
|
|
|
| 511 |
if sources_count >= 3 and answer_length > 500:
|
| 512 |
return "high"
|
| 513 |
|
|
|
|
| 514 |
elif sources_count >= 2 and answer_length > 200:
|
| 515 |
return "medium"
|
| 516 |
|
|
|
|
| 517 |
else:
|
| 518 |
return "low"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/vector_store.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
UPDATED for ChromaDB v0.4.22+ (auto-persist, no manual persist needed)
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
import json
|
|
@@ -12,14 +11,14 @@ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
|
|
| 12 |
|
| 13 |
|
| 14 |
class CLIPEmbedder:
|
| 15 |
-
"""
|
| 16 |
def __init__(self, model_name: str = EMBEDDING_MODEL):
|
| 17 |
-
print(f"
|
| 18 |
self.model = SentenceTransformer(model_name)
|
| 19 |
-
print(f"
|
| 20 |
|
| 21 |
def embed(self, text: str) -> List[float]:
|
| 22 |
-
"""
|
| 23 |
try:
|
| 24 |
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 25 |
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
|
@@ -28,7 +27,7 @@ class CLIPEmbedder:
|
|
| 28 |
return [0.0] * EMBEDDING_DIM
|
| 29 |
|
| 30 |
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
| 31 |
-
"""
|
| 32 |
try:
|
| 33 |
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 34 |
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
|
@@ -38,34 +37,31 @@ class CLIPEmbedder:
|
|
| 38 |
|
| 39 |
|
| 40 |
class VectorStore:
|
| 41 |
-
"""
|
| 42 |
def __init__(self):
|
| 43 |
self.persist_directory = CHROMA_DB_PATH
|
| 44 |
self.embedder = CLIPEmbedder()
|
| 45 |
|
| 46 |
-
print(f"\
|
| 47 |
|
| 48 |
-
# NEW ChromaDB v0.4.22+ - PersistentClient auto-persists
|
| 49 |
try:
|
| 50 |
self.client = chromadb.PersistentClient(
|
| 51 |
path=self.persist_directory
|
| 52 |
)
|
| 53 |
-
print(f"
|
| 54 |
except Exception as e:
|
| 55 |
-
print(f"
|
| 56 |
-
print(f"Trying fallback initialization...")
|
| 57 |
self.client = chromadb.PersistentClient(
|
| 58 |
path=self.persist_directory
|
| 59 |
)
|
| 60 |
|
| 61 |
-
# Get or create collection
|
| 62 |
try:
|
| 63 |
-
self.collection = self.client.
|
| 64 |
name="multimodal_rag",
|
| 65 |
metadata={"hnsw:space": "cosine"}
|
| 66 |
)
|
| 67 |
count = self.collection.count()
|
| 68 |
-
print(f"
|
| 69 |
except Exception as e:
|
| 70 |
print(f"Error with collection: {e}")
|
| 71 |
self.collection = self.client.get_or_create_collection(
|
|
@@ -73,14 +69,13 @@ class VectorStore:
|
|
| 73 |
)
|
| 74 |
|
| 75 |
def add_documents(self, documents: List[Dict], doc_id: str):
|
| 76 |
-
"""
|
| 77 |
texts = []
|
| 78 |
metadatas = []
|
| 79 |
ids = []
|
| 80 |
|
| 81 |
-
print(f"\
|
| 82 |
|
| 83 |
-
# Add text chunks
|
| 84 |
if 'text' in documents and documents['text']:
|
| 85 |
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 86 |
for idx, chunk in enumerate(chunks):
|
|
@@ -91,9 +86,8 @@ class VectorStore:
|
|
| 91 |
'chunk_idx': str(idx)
|
| 92 |
})
|
| 93 |
ids.append(f"{doc_id}_text_{idx}")
|
| 94 |
-
print(f"
|
| 95 |
|
| 96 |
-
# Add image descriptions and OCR text
|
| 97 |
if 'images' in documents:
|
| 98 |
image_count = 0
|
| 99 |
for idx, image_data in enumerate(documents['images']):
|
|
@@ -108,9 +102,8 @@ class VectorStore:
|
|
| 108 |
ids.append(f"{doc_id}_image_{idx}")
|
| 109 |
image_count += 1
|
| 110 |
if image_count > 0:
|
| 111 |
-
print(f"
|
| 112 |
|
| 113 |
-
# Add table content
|
| 114 |
if 'tables' in documents:
|
| 115 |
table_count = 0
|
| 116 |
for idx, table_data in enumerate(documents['tables']):
|
|
@@ -124,14 +117,12 @@ class VectorStore:
|
|
| 124 |
ids.append(f"{doc_id}_table_{idx}")
|
| 125 |
table_count += 1
|
| 126 |
if table_count > 0:
|
| 127 |
-
print(f"
|
| 128 |
|
| 129 |
if texts:
|
| 130 |
-
# Generate embeddings
|
| 131 |
print(f" 🔄 Generating {len(texts)} embeddings...")
|
| 132 |
embeddings = self.embedder.embed_batch(texts)
|
| 133 |
|
| 134 |
-
# Add to collection
|
| 135 |
try:
|
| 136 |
self.collection.add(
|
| 137 |
ids=ids,
|
|
@@ -139,14 +130,12 @@ class VectorStore:
|
|
| 139 |
embeddings=embeddings,
|
| 140 |
metadatas=metadatas
|
| 141 |
)
|
| 142 |
-
print(f"
|
| 143 |
-
# Auto-persist happens here
|
| 144 |
-
print(f"✅ Data persisted automatically to: {self.persist_directory}")
|
| 145 |
except Exception as e:
|
| 146 |
-
print(f"
|
| 147 |
|
| 148 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 149 |
-
"""
|
| 150 |
try:
|
| 151 |
query_embedding = self.embedder.embed(query)
|
| 152 |
|
|
@@ -155,7 +144,6 @@ class VectorStore:
|
|
| 155 |
n_results=n_results
|
| 156 |
)
|
| 157 |
|
| 158 |
-
# Format results
|
| 159 |
formatted_results = []
|
| 160 |
if results['documents']:
|
| 161 |
for i, doc in enumerate(results['documents'][0]):
|
|
@@ -175,7 +163,7 @@ class VectorStore:
|
|
| 175 |
return []
|
| 176 |
|
| 177 |
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
| 178 |
-
"""
|
| 179 |
chunks = []
|
| 180 |
start = 0
|
| 181 |
while start < len(text):
|
|
@@ -185,7 +173,7 @@ class VectorStore:
|
|
| 185 |
return chunks
|
| 186 |
|
| 187 |
def get_collection_info(self) -> Dict:
|
| 188 |
-
"""
|
| 189 |
try:
|
| 190 |
count = self.collection.count()
|
| 191 |
return {
|
|
@@ -199,35 +187,23 @@ class VectorStore:
|
|
| 199 |
return {'status': 'error', 'message': str(e)}
|
| 200 |
|
| 201 |
def delete_by_doc_id(self, doc_id: str):
|
| 202 |
-
"""
|
| 203 |
try:
|
| 204 |
-
# Get all IDs with this doc_id
|
| 205 |
results = self.collection.get(where={'doc_id': doc_id})
|
| 206 |
if results['ids']:
|
| 207 |
self.collection.delete(ids=results['ids'])
|
| 208 |
-
print(f"
|
| 209 |
-
# Auto-persist on delete
|
| 210 |
-
print(f"✅ Changes persisted automatically")
|
| 211 |
except Exception as e:
|
| 212 |
print(f"Error deleting documents: {e}")
|
| 213 |
|
| 214 |
-
def persist(self):
|
| 215 |
-
"""
|
| 216 |
-
No-op for compatibility with older code.
|
| 217 |
-
ChromaDB v0.4.22+ uses PersistentClient which auto-persists.
|
| 218 |
-
This method kept for backward compatibility.
|
| 219 |
-
"""
|
| 220 |
-
print("✅ Vector store is using auto-persist (no manual persist needed)")
|
| 221 |
-
|
| 222 |
def clear_all(self):
|
| 223 |
-
"""
|
| 224 |
try:
|
| 225 |
-
# Delete collection and recreate
|
| 226 |
self.client.delete_collection(name="multimodal_rag")
|
| 227 |
self.collection = self.client.get_or_create_collection(
|
| 228 |
name="multimodal_rag",
|
| 229 |
metadata={"hnsw:space": "cosine"}
|
| 230 |
)
|
| 231 |
-
print("
|
| 232 |
except Exception as e:
|
| 233 |
print(f"Error clearing collection: {e}")
|
|
|
|
| 1 |
"""
|
| 2 |
+
Векторное хранилище и Эмбеддер
|
|
|
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
class CLIPEmbedder:
|
| 14 |
+
"""Эмбеддер"""
|
| 15 |
def __init__(self, model_name: str = EMBEDDING_MODEL):
|
| 16 |
+
print(f"Embedding model: {model_name}")
|
| 17 |
self.model = SentenceTransformer(model_name)
|
| 18 |
+
print(f"Model loaded successfully")
|
| 19 |
|
| 20 |
def embed(self, text: str) -> List[float]:
|
| 21 |
+
"""Эмбеддинг для текста"""
|
| 22 |
try:
|
| 23 |
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 24 |
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
|
|
|
| 27 |
return [0.0] * EMBEDDING_DIM
|
| 28 |
|
| 29 |
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
| 30 |
+
"""Эмбеддинги для списка текстов (батч)"""
|
| 31 |
try:
|
| 32 |
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 33 |
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
class VectorStore:
|
| 40 |
+
"""Векторное хранилище"""
|
| 41 |
def __init__(self):
|
| 42 |
self.persist_directory = CHROMA_DB_PATH
|
| 43 |
self.embedder = CLIPEmbedder()
|
| 44 |
|
| 45 |
+
print(f"\nInitializing ChromaDB: {self.persist_directory}")
|
| 46 |
|
|
|
|
| 47 |
try:
|
| 48 |
self.client = chromadb.PersistentClient(
|
| 49 |
path=self.persist_directory
|
| 50 |
)
|
| 51 |
+
print(f"ChromaDB initialized")
|
| 52 |
except Exception as e:
|
| 53 |
+
print(f"Error initializing ChromaDB: {e}")
|
|
|
|
| 54 |
self.client = chromadb.PersistentClient(
|
| 55 |
path=self.persist_directory
|
| 56 |
)
|
| 57 |
|
|
|
|
| 58 |
try:
|
| 59 |
+
self.collection = self.client.get_or_create_collection(
|
| 60 |
name="multimodal_rag",
|
| 61 |
metadata={"hnsw:space": "cosine"}
|
| 62 |
)
|
| 63 |
count = self.collection.count()
|
| 64 |
+
print(f"Collection loaded: {count} items in store")
|
| 65 |
except Exception as e:
|
| 66 |
print(f"Error with collection: {e}")
|
| 67 |
self.collection = self.client.get_or_create_collection(
|
|
|
|
| 69 |
)
|
| 70 |
|
| 71 |
def add_documents(self, documents: List[Dict], doc_id: str):
|
| 72 |
+
"""Добавление документов в векторное хранилище"""
|
| 73 |
texts = []
|
| 74 |
metadatas = []
|
| 75 |
ids = []
|
| 76 |
|
| 77 |
+
print(f"\nAdding document: {doc_id}")
|
| 78 |
|
|
|
|
| 79 |
if 'text' in documents and documents['text']:
|
| 80 |
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 81 |
for idx, chunk in enumerate(chunks):
|
|
|
|
| 86 |
'chunk_idx': str(idx)
|
| 87 |
})
|
| 88 |
ids.append(f"{doc_id}_text_{idx}")
|
| 89 |
+
print(f" Text: {len(chunks)} chunks")
|
| 90 |
|
|
|
|
| 91 |
if 'images' in documents:
|
| 92 |
image_count = 0
|
| 93 |
for idx, image_data in enumerate(documents['images']):
|
|
|
|
| 102 |
ids.append(f"{doc_id}_image_{idx}")
|
| 103 |
image_count += 1
|
| 104 |
if image_count > 0:
|
| 105 |
+
print(f" Images: {image_count} with OCR text")
|
| 106 |
|
|
|
|
| 107 |
if 'tables' in documents:
|
| 108 |
table_count = 0
|
| 109 |
for idx, table_data in enumerate(documents['tables']):
|
|
|
|
| 117 |
ids.append(f"{doc_id}_table_{idx}")
|
| 118 |
table_count += 1
|
| 119 |
if table_count > 0:
|
| 120 |
+
print(f" Tables: {table_count}")
|
| 121 |
|
| 122 |
if texts:
|
|
|
|
| 123 |
print(f" 🔄 Generating {len(texts)} embeddings...")
|
| 124 |
embeddings = self.embedder.embed_batch(texts)
|
| 125 |
|
|
|
|
| 126 |
try:
|
| 127 |
self.collection.add(
|
| 128 |
ids=ids,
|
|
|
|
| 130 |
embeddings=embeddings,
|
| 131 |
metadatas=metadatas
|
| 132 |
)
|
| 133 |
+
print(f"Successfully added {len(texts)} items to vector store")
|
|
|
|
|
|
|
| 134 |
except Exception as e:
|
| 135 |
+
print(f"Error adding to collection: {e}")
|
| 136 |
|
| 137 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 138 |
+
"""Поиск в векторном хранилище"""
|
| 139 |
try:
|
| 140 |
query_embedding = self.embedder.embed(query)
|
| 141 |
|
|
|
|
| 144 |
n_results=n_results
|
| 145 |
)
|
| 146 |
|
|
|
|
| 147 |
formatted_results = []
|
| 148 |
if results['documents']:
|
| 149 |
for i, doc in enumerate(results['documents'][0]):
|
|
|
|
| 163 |
return []
|
| 164 |
|
| 165 |
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
| 166 |
+
"""Сплит текста"""
|
| 167 |
chunks = []
|
| 168 |
start = 0
|
| 169 |
while start < len(text):
|
|
|
|
| 173 |
return chunks
|
| 174 |
|
| 175 |
def get_collection_info(self) -> Dict:
|
| 176 |
+
"""Получение информации о коллекции в векторном хранилище"""
|
| 177 |
try:
|
| 178 |
count = self.collection.count()
|
| 179 |
return {
|
|
|
|
| 187 |
return {'status': 'error', 'message': str(e)}
|
| 188 |
|
| 189 |
def delete_by_doc_id(self, doc_id: str):
|
| 190 |
+
"""Удаление документа из векторного хранилища"""
|
| 191 |
try:
|
|
|
|
| 192 |
results = self.collection.get(where={'doc_id': doc_id})
|
| 193 |
if results['ids']:
|
| 194 |
self.collection.delete(ids=results['ids'])
|
| 195 |
+
print(f"Deleted {len(results['ids'])} documents for {doc_id}")
|
|
|
|
|
|
|
| 196 |
except Exception as e:
|
| 197 |
print(f"Error deleting documents: {e}")
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
def clear_all(self):
|
| 200 |
+
"""Очистка хранилища"""
|
| 201 |
try:
|
|
|
|
| 202 |
self.client.delete_collection(name="multimodal_rag")
|
| 203 |
self.collection = self.client.get_or_create_collection(
|
| 204 |
name="multimodal_rag",
|
| 205 |
metadata={"hnsw:space": "cosine"}
|
| 206 |
)
|
| 207 |
+
print("Collection cleared")
|
| 208 |
except Exception as e:
|
| 209 |
print(f"Error clearing collection: {e}")
|