Spaces:
Sleeping
Sleeping
Simplify
Browse files- src/app.py +240 -83
- src/config.py +20 -9
- src/pdf_parser.py +41 -11
- src/rag_system.py +155 -53
- src/vector_store.py +49 -18
src/app.py
CHANGED
|
@@ -1,25 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
|
|
|
|
|
|
| 4 |
from pdf_parser import PDFParser
|
| 5 |
from vector_store import VectorStore
|
| 6 |
-
from rag_system import VisualMultimodalRAG
|
| 7 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
st.set_page_config(
|
| 10 |
-
page_title="
|
| 11 |
-
page_icon="",
|
| 12 |
layout="wide",
|
| 13 |
initial_sidebar_state="expanded"
|
| 14 |
)
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
if 'api_key_set' not in st.session_state:
|
| 17 |
st.session_state.api_key_set = False
|
| 18 |
|
| 19 |
if 'api_key' not in st.session_state:
|
| 20 |
st.session_state.api_key = None
|
| 21 |
|
| 22 |
-
if 'visual_rag_system' not in st.session_state:
|
| 23 |
st.session_state.visual_rag_system = None
|
| 24 |
|
| 25 |
if 'vector_store' not in st.session_state:
|
|
@@ -40,26 +56,39 @@ if 'current_images' not in st.session_state:
|
|
| 40 |
if 'current_tables' not in st.session_state:
|
| 41 |
st.session_state.current_tables = None
|
| 42 |
|
| 43 |
-
if 'processing_results' not in st.session_state:
|
| 44 |
st.session_state.processing_results = None
|
| 45 |
|
| 46 |
if 'answering_rag' not in st.session_state:
|
| 47 |
st.session_state.answering_rag = None
|
| 48 |
|
| 49 |
-
st.title("Мультимодальная система RAG LLM")
|
| 50 |
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
""")
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
with st.sidebar:
|
| 57 |
-
st.header("
|
| 58 |
|
| 59 |
-
|
|
|
|
| 60 |
|
| 61 |
api_key = st.text_input(
|
| 62 |
-
"
|
| 63 |
type="password",
|
| 64 |
key="api_key_input"
|
| 65 |
)
|
|
@@ -68,116 +97,154 @@ with st.sidebar:
|
|
| 68 |
st.session_state.api_key = api_key
|
| 69 |
st.session_state.api_key_set = True
|
| 70 |
|
|
|
|
| 71 |
if st.session_state.visual_rag_system is None:
|
| 72 |
try:
|
| 73 |
-
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True)
|
| 74 |
st.session_state.vector_store = VectorStore()
|
| 75 |
st.session_state.parser = PDFParser(debug=True)
|
| 76 |
-
st.success("
|
| 77 |
except Exception as e:
|
| 78 |
-
st.error(f"
|
| 79 |
else:
|
| 80 |
st.session_state.api_key_set = False
|
| 81 |
-
st.warning("
|
| 82 |
|
| 83 |
st.divider()
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
if st.session_state.vector_store:
|
| 88 |
try:
|
| 89 |
info = st.session_state.vector_store.get_collection_info()
|
| 90 |
-
st.metric("
|
| 91 |
-
st.
|
|
|
|
| 92 |
except Exception as e:
|
| 93 |
-
st.error(f"
|
| 94 |
else:
|
| 95 |
-
st.info("
|
| 96 |
|
| 97 |
st.divider()
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
if st.button("
|
| 102 |
if st.session_state.vector_store:
|
| 103 |
try:
|
| 104 |
st.session_state.vector_store.clear_all()
|
| 105 |
-
st.success("
|
| 106 |
except Exception as e:
|
| 107 |
-
st.error(f"
|
|
|
|
| 108 |
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
uploaded_file = st.file_uploader(
|
| 112 |
-
"
|
| 113 |
type=['pdf'],
|
| 114 |
-
help="PDF
|
| 115 |
)
|
| 116 |
|
| 117 |
if uploaded_file is not None:
|
|
|
|
| 118 |
upload_path = Path(UPLOAD_FOLDER)
|
| 119 |
upload_path.mkdir(exist_ok=True)
|
|
|
|
| 120 |
file_path = upload_path / uploaded_file.name
|
| 121 |
with open(file_path, 'wb') as f:
|
| 122 |
f.write(uploaded_file.getbuffer())
|
| 123 |
-
st.success(f"Файл сохранён: {uploaded_file.name}")
|
| 124 |
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
| 126 |
if not st.session_state.api_key_set:
|
| 127 |
-
st.error("
|
| 128 |
else:
|
| 129 |
try:
|
| 130 |
-
with st.spinner("
|
| 131 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
parser = st.session_state.parser
|
| 133 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 134 |
|
|
|
|
| 135 |
st.session_state.current_document = uploaded_file.name
|
| 136 |
st.session_state.current_text = text
|
| 137 |
st.session_state.current_images = images
|
| 138 |
st.session_state.current_tables = tables
|
| 139 |
|
|
|
|
| 140 |
col1, col2, col3 = st.columns(3)
|
| 141 |
with col1:
|
| 142 |
-
st.metric("
|
| 143 |
with col2:
|
| 144 |
-
st.metric("
|
| 145 |
with col3:
|
| 146 |
-
st.metric("
|
| 147 |
|
|
|
|
| 148 |
if images:
|
| 149 |
-
st.subheader("
|
| 150 |
for idx, img in enumerate(images):
|
| 151 |
ocr_text = img.get('ocr_text', '')
|
| 152 |
ocr_len = len(ocr_text)
|
|
|
|
| 153 |
if ocr_len > 0:
|
| 154 |
-
st.success(f"
|
| 155 |
else:
|
| 156 |
-
st.warning(f"
|
|
|
|
|
|
|
| 157 |
|
| 158 |
-
st.success("Парсинг PDF завершён!")
|
| 159 |
except Exception as e:
|
| 160 |
-
st.error(f"
|
|
|
|
| 161 |
|
| 162 |
-
st.divider()
|
| 163 |
|
| 164 |
-
|
|
|
|
|
|
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
-
if st.button("
|
| 168 |
if not st.session_state.api_key_set:
|
| 169 |
-
st.error("
|
| 170 |
elif st.session_state.current_text is None:
|
| 171 |
-
st.error("
|
| 172 |
else:
|
| 173 |
try:
|
| 174 |
-
with st.spinner("
|
| 175 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
visual_rag = st.session_state.visual_rag_system
|
| 177 |
vector_store = st.session_state.vector_store
|
|
|
|
| 178 |
results = visual_rag.process_and_store_document(
|
| 179 |
text=st.session_state.current_text,
|
| 180 |
-
images=st.session_state.current_images,
|
| 181 |
tables=st.session_state.current_tables,
|
| 182 |
vector_store=vector_store,
|
| 183 |
doc_id=st.session_state.current_document or "current_doc"
|
|
@@ -185,97 +252,187 @@ if st.button("Анализировать"):
|
|
| 185 |
|
| 186 |
st.session_state.processing_results = results
|
| 187 |
|
| 188 |
-
|
|
|
|
| 189 |
|
| 190 |
col1, col2, col3 = st.columns(3)
|
| 191 |
with col1:
|
| 192 |
-
st.metric("
|
| 193 |
with col2:
|
| 194 |
-
st.metric("
|
| 195 |
with col3:
|
| 196 |
-
st.metric("
|
| 197 |
|
| 198 |
-
st.metric("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
except Exception as e:
|
| 201 |
-
st.error(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
st.divider()
|
|
|
|
| 204 |
|
| 205 |
-
|
|
|
|
|
|
|
| 206 |
|
|
|
|
| 207 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
|
|
|
| 208 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 209 |
|
| 210 |
question = st.text_area(
|
| 211 |
-
"
|
| 212 |
height=100,
|
| 213 |
-
placeholder="
|
| 214 |
)
|
| 215 |
|
| 216 |
-
if st.button("
|
| 217 |
if not st.session_state.api_key_set:
|
| 218 |
-
st.error("
|
| 219 |
elif st.session_state.current_text is None:
|
| 220 |
-
st.error("
|
| 221 |
elif not question:
|
| 222 |
-
st.error("
|
| 223 |
else:
|
| 224 |
try:
|
| 225 |
-
with st.spinner("
|
| 226 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
store = st.session_state.vector_store
|
| 228 |
|
|
|
|
| 229 |
doc_name = st.session_state.current_document or "current_doc"
|
| 230 |
doc_data = {
|
| 231 |
'text': st.session_state.current_text,
|
| 232 |
'images': [],
|
| 233 |
'tables': []
|
| 234 |
}
|
| 235 |
-
|
| 236 |
store.add_documents(doc_data, doc_name)
|
| 237 |
|
|
|
|
| 238 |
search_results = store.search(question, n_results=5)
|
| 239 |
|
|
|
|
|
|
|
|
|
|
| 240 |
answering_rag = st.session_state.answering_rag
|
| 241 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 242 |
|
| 243 |
-
|
|
|
|
| 244 |
|
| 245 |
-
st.subheader("
|
| 246 |
|
|
|
|
| 247 |
col1, col2, col3 = st.columns(3)
|
| 248 |
with col1:
|
| 249 |
-
|
| 250 |
-
'high': '
|
| 251 |
-
'medium': '
|
| 252 |
-
'low': '
|
| 253 |
-
}
|
| 254 |
-
|
| 255 |
-
st.metric("Уверенность", confidence_text)
|
| 256 |
with col2:
|
| 257 |
-
st.metric("
|
| 258 |
with col3:
|
| 259 |
if result['sources_used'] > 0:
|
| 260 |
-
st.metric("
|
| 261 |
|
|
|
|
| 262 |
st.write(result['answer'])
|
| 263 |
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
|
|
|
| 267 |
relevance = source['relevance']
|
| 268 |
relevance_bar = "█" * int(relevance * 10) + "░" * (10 - int(relevance * 10))
|
|
|
|
| 269 |
with st.expander(
|
| 270 |
-
f"
|
| 271 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 272 |
):
|
| 273 |
st.write(source['content'])
|
|
|
|
|
|
|
|
|
|
| 274 |
except Exception as e:
|
| 275 |
-
st.error(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
st.divider()
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
st.caption(
|
| 280 |
-
"
|
|
|
|
|
|
|
|
|
|
| 281 |
)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Multimodal RAG LLM System - Streamlit App
|
| 3 |
+
Complete working version with VISUAL image analysis using gpt-4o
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import streamlit as st
|
| 7 |
import os
|
| 8 |
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Import optimized versions
|
| 11 |
from pdf_parser import PDFParser
|
| 12 |
from vector_store import VectorStore
|
| 13 |
+
from rag_system import VisualMultimodalRAG # NEW - Vision model
|
| 14 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 15 |
|
| 16 |
+
|
| 17 |
+
# ============================================================================
|
| 18 |
+
# PAGE CONFIGURATION
|
| 19 |
+
# ============================================================================
|
| 20 |
+
|
| 21 |
st.set_page_config(
|
| 22 |
+
page_title="📄 Multimodal RAG LLM System",
|
| 23 |
+
page_icon="🤖",
|
| 24 |
layout="wide",
|
| 25 |
initial_sidebar_state="expanded"
|
| 26 |
)
|
| 27 |
|
| 28 |
+
# ============================================================================
|
| 29 |
+
# SESSION STATE INITIALIZATION
|
| 30 |
+
# ============================================================================
|
| 31 |
+
|
| 32 |
if 'api_key_set' not in st.session_state:
|
| 33 |
st.session_state.api_key_set = False
|
| 34 |
|
| 35 |
if 'api_key' not in st.session_state:
|
| 36 |
st.session_state.api_key = None
|
| 37 |
|
| 38 |
+
if 'visual_rag_system' not in st.session_state: # NEW - Vision model
|
| 39 |
st.session_state.visual_rag_system = None
|
| 40 |
|
| 41 |
if 'vector_store' not in st.session_state:
|
|
|
|
| 56 |
if 'current_tables' not in st.session_state:
|
| 57 |
st.session_state.current_tables = None
|
| 58 |
|
| 59 |
+
if 'processing_results' not in st.session_state: # NEW
|
| 60 |
st.session_state.processing_results = None
|
| 61 |
|
| 62 |
if 'answering_rag' not in st.session_state:
|
| 63 |
st.session_state.answering_rag = None
|
| 64 |
|
|
|
|
| 65 |
|
| 66 |
+
# ============================================================================
|
| 67 |
+
# MAIN HEADER
|
| 68 |
+
# ============================================================================
|
| 69 |
|
| 70 |
+
st.title("📄 Multimodal RAG LLM System")
|
| 71 |
+
st.markdown("""
|
| 72 |
+
Process PDF documents with visual image analysis:
|
| 73 |
+
- **PDF Parser** with OCR for Russian & English
|
| 74 |
+
- **Visual Analysis** (gpt-4o) for image understanding
|
| 75 |
+
- **Vector Store** (ChromaDB) for semantic search
|
| 76 |
+
- **Individual Component** summarization and storage
|
| 77 |
""")
|
| 78 |
|
| 79 |
+
|
| 80 |
+
# ============================================================================
|
| 81 |
+
# SIDEBAR - CONFIGURATION
|
| 82 |
+
# ============================================================================
|
| 83 |
+
|
| 84 |
with st.sidebar:
|
| 85 |
+
st.header("⚙️ Configuration")
|
| 86 |
|
| 87 |
+
# API Key Section
|
| 88 |
+
st.subheader("🔑 OpenAI API Key")
|
| 89 |
|
| 90 |
api_key = st.text_input(
|
| 91 |
+
"Enter your OpenAI API key:",
|
| 92 |
type="password",
|
| 93 |
key="api_key_input"
|
| 94 |
)
|
|
|
|
| 97 |
st.session_state.api_key = api_key
|
| 98 |
st.session_state.api_key_set = True
|
| 99 |
|
| 100 |
+
# Initialize RAG systems if not already done
|
| 101 |
if st.session_state.visual_rag_system is None:
|
| 102 |
try:
|
| 103 |
+
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
|
| 104 |
st.session_state.vector_store = VectorStore()
|
| 105 |
st.session_state.parser = PDFParser(debug=True)
|
| 106 |
+
st.success("✅ API Key set & systems initialized")
|
| 107 |
except Exception as e:
|
| 108 |
+
st.error(f"Error initializing systems: {e}")
|
| 109 |
else:
|
| 110 |
st.session_state.api_key_set = False
|
| 111 |
+
st.warning("⚠️ Please enter your API key to continue")
|
| 112 |
|
| 113 |
st.divider()
|
| 114 |
|
| 115 |
+
# Vector Store Status
|
| 116 |
+
st.subheader("📊 Vector Store Status")
|
| 117 |
if st.session_state.vector_store:
|
| 118 |
try:
|
| 119 |
info = st.session_state.vector_store.get_collection_info()
|
| 120 |
+
st.metric("Items in Store", info['count'])
|
| 121 |
+
st.metric("Status", info['status'])
|
| 122 |
+
st.caption(f"Path: {info['persist_path']}")
|
| 123 |
except Exception as e:
|
| 124 |
+
st.error(f"Error getting store info: {e}")
|
| 125 |
else:
|
| 126 |
+
st.info("Set API key to initialize vector store")
|
| 127 |
|
| 128 |
st.divider()
|
| 129 |
|
| 130 |
+
# Document Management
|
| 131 |
+
st.subheader("📁 Document Management")
|
| 132 |
+
if st.button("🔄 Clear Vector Store"):
|
| 133 |
if st.session_state.vector_store:
|
| 134 |
try:
|
| 135 |
st.session_state.vector_store.clear_all()
|
| 136 |
+
st.success("✅ Vector store cleared")
|
| 137 |
except Exception as e:
|
| 138 |
+
st.error(f"Error clearing store: {e}")
|
| 139 |
+
|
| 140 |
|
| 141 |
+
# ============================================================================
|
| 142 |
+
# MAIN CONTENT
|
| 143 |
+
# ============================================================================
|
| 144 |
+
|
| 145 |
+
# Upload Section
|
| 146 |
+
st.header("📤 Upload PDF Document")
|
| 147 |
|
| 148 |
uploaded_file = st.file_uploader(
|
| 149 |
+
"Choose a PDF file",
|
| 150 |
type=['pdf'],
|
| 151 |
+
help="PDF with text, images, and tables"
|
| 152 |
)
|
| 153 |
|
| 154 |
if uploaded_file is not None:
|
| 155 |
+
# Save uploaded file
|
| 156 |
upload_path = Path(UPLOAD_FOLDER)
|
| 157 |
upload_path.mkdir(exist_ok=True)
|
| 158 |
+
|
| 159 |
file_path = upload_path / uploaded_file.name
|
| 160 |
with open(file_path, 'wb') as f:
|
| 161 |
f.write(uploaded_file.getbuffer())
|
|
|
|
| 162 |
|
| 163 |
+
st.success(f"✅ File saved: {uploaded_file.name}")
|
| 164 |
+
|
| 165 |
+
# Parse PDF
|
| 166 |
+
if st.button("🔍 Parse PDF"):
|
| 167 |
if not st.session_state.api_key_set:
|
| 168 |
+
st.error("❌ Please set OpenAI API key first")
|
| 169 |
else:
|
| 170 |
try:
|
| 171 |
+
with st.spinner("📄 Parsing PDF..."):
|
| 172 |
+
print(f"\n{'='*70}")
|
| 173 |
+
print(f"PARSING: {uploaded_file.name}")
|
| 174 |
+
print(f"{'='*70}")
|
| 175 |
+
|
| 176 |
+
# Parse PDF - returns text, images, tables
|
| 177 |
parser = st.session_state.parser
|
| 178 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 179 |
|
| 180 |
+
# Store in session state
|
| 181 |
st.session_state.current_document = uploaded_file.name
|
| 182 |
st.session_state.current_text = text
|
| 183 |
st.session_state.current_images = images
|
| 184 |
st.session_state.current_tables = tables
|
| 185 |
|
| 186 |
+
# Display results
|
| 187 |
col1, col2, col3 = st.columns(3)
|
| 188 |
with col1:
|
| 189 |
+
st.metric("📝 Text", f"{len(text):,} chars")
|
| 190 |
with col2:
|
| 191 |
+
st.metric("🖼️ Images", len(images))
|
| 192 |
with col3:
|
| 193 |
+
st.metric("📋 Tables", len(tables))
|
| 194 |
|
| 195 |
+
# Show image OCR details
|
| 196 |
if images:
|
| 197 |
+
st.subheader("🖼️ Extracted Images")
|
| 198 |
for idx, img in enumerate(images):
|
| 199 |
ocr_text = img.get('ocr_text', '')
|
| 200 |
ocr_len = len(ocr_text)
|
| 201 |
+
|
| 202 |
if ocr_len > 0:
|
| 203 |
+
st.success(f"✅ Image {idx}: {ocr_len} characters (OCR)")
|
| 204 |
else:
|
| 205 |
+
st.warning(f"⚠️ Image {idx}: No OCR text (will use visual analysis)")
|
| 206 |
+
|
| 207 |
+
st.success("✅ PDF parsing complete!")
|
| 208 |
|
|
|
|
| 209 |
except Exception as e:
|
| 210 |
+
st.error(f"❌ Error parsing PDF: {e}")
|
| 211 |
+
print(f"Error: {e}")
|
| 212 |
|
|
|
|
| 213 |
|
| 214 |
+
# ============================================================================
|
| 215 |
+
# VISUAL IMAGE ANALYSIS & COMPONENT STORAGE
|
| 216 |
+
# ============================================================================
|
| 217 |
|
| 218 |
+
st.divider()
|
| 219 |
+
st.header("🖼️ Visual Analysis & Storage")
|
| 220 |
+
|
| 221 |
+
st.info("""
|
| 222 |
+
**How it works:**
|
| 223 |
+
1. Images are sent to gpt-4o for visual analysis (not just text OCR)
|
| 224 |
+
2. Text is split into chunks and each chunk is summarized
|
| 225 |
+
3. Tables are analyzed individually
|
| 226 |
+
4. ALL summaries are stored in the vector store for semantic search
|
| 227 |
+
""")
|
| 228 |
|
| 229 |
+
if st.button("🖼️ Analyze Images Visually & Store Components"):
|
| 230 |
if not st.session_state.api_key_set:
|
| 231 |
+
st.error("❌ Please set OpenAI API key first")
|
| 232 |
elif st.session_state.current_text is None:
|
| 233 |
+
st.error("❌ Please parse a PDF document first")
|
| 234 |
else:
|
| 235 |
try:
|
| 236 |
+
with st.spinner("🖼️ Analyzing images visually with gpt-4o..."):
|
| 237 |
+
print(f"\n{'='*70}")
|
| 238 |
+
print(f"VISUAL IMAGE ANALYSIS")
|
| 239 |
+
print(f"{'='*70}")
|
| 240 |
+
|
| 241 |
+
# Process with visual analysis
|
| 242 |
visual_rag = st.session_state.visual_rag_system
|
| 243 |
vector_store = st.session_state.vector_store
|
| 244 |
+
|
| 245 |
results = visual_rag.process_and_store_document(
|
| 246 |
text=st.session_state.current_text,
|
| 247 |
+
images=st.session_state.current_images, # Actual images sent to gpt-4o
|
| 248 |
tables=st.session_state.current_tables,
|
| 249 |
vector_store=vector_store,
|
| 250 |
doc_id=st.session_state.current_document or "current_doc"
|
|
|
|
| 252 |
|
| 253 |
st.session_state.processing_results = results
|
| 254 |
|
| 255 |
+
# Display results
|
| 256 |
+
st.success("✅ Visual analysis complete & stored!")
|
| 257 |
|
| 258 |
col1, col2, col3 = st.columns(3)
|
| 259 |
with col1:
|
| 260 |
+
st.metric("🖼️ Images Analyzed", len(results['image_visual_analyses']))
|
| 261 |
with col2:
|
| 262 |
+
st.metric("📝 Text Chunks", len(results['text_summaries']))
|
| 263 |
with col3:
|
| 264 |
+
st.metric("📋 Tables Analyzed", len(results['table_summaries']))
|
| 265 |
|
| 266 |
+
st.metric("📊 Total Stored in Vector", results['total_stored'])
|
| 267 |
+
|
| 268 |
+
# Show image visual analyses
|
| 269 |
+
if results['image_visual_analyses']:
|
| 270 |
+
st.subheader("🖼️ Visual Image Analyses (gpt-4o)")
|
| 271 |
+
for img_analysis in results['image_visual_analyses']:
|
| 272 |
+
with st.expander(f"Image {img_analysis['image_index']} - Visual Analysis"):
|
| 273 |
+
st.write("**Visual Analysis by gpt-4o:**")
|
| 274 |
+
st.write(img_analysis['visual_analysis'])
|
| 275 |
+
|
| 276 |
+
st.write("**Image Path:**")
|
| 277 |
+
st.code(img_analysis['image_path'])
|
| 278 |
+
|
| 279 |
+
if img_analysis['ocr_text']:
|
| 280 |
+
st.write("**OCR Text (backup):**")
|
| 281 |
+
st.text(img_analysis['ocr_text'][:500])
|
| 282 |
+
|
| 283 |
+
# Show text chunk summaries
|
| 284 |
+
if results['text_summaries']:
|
| 285 |
+
st.subheader("📝 Text Chunk Summaries")
|
| 286 |
+
for chunk_summary in results['text_summaries']:
|
| 287 |
+
with st.expander(
|
| 288 |
+
f"Chunk {chunk_summary['chunk_index']} "
|
| 289 |
+
f"({chunk_summary['chunk_length']} chars)"
|
| 290 |
+
):
|
| 291 |
+
st.write("**Summary:**")
|
| 292 |
+
st.write(chunk_summary['summary'])
|
| 293 |
+
st.write("**Original Text (first 500 chars):**")
|
| 294 |
+
st.text(chunk_summary['original_text'])
|
| 295 |
+
|
| 296 |
+
# Show table analyses
|
| 297 |
+
if results['table_summaries']:
|
| 298 |
+
st.subheader("📋 Table Analyses")
|
| 299 |
+
for table_summary in results['table_summaries']:
|
| 300 |
+
with st.expander(
|
| 301 |
+
f"Table {table_summary['table_index']} "
|
| 302 |
+
f"({table_summary['table_length']} chars)"
|
| 303 |
+
):
|
| 304 |
+
st.write("**Analysis:**")
|
| 305 |
+
st.write(table_summary['summary'])
|
| 306 |
+
st.write("**Original Content (first 500 chars):**")
|
| 307 |
+
st.text(table_summary['original_content'])
|
| 308 |
+
|
| 309 |
+
print(f"\n✅ Visual analysis processing complete!")
|
| 310 |
|
| 311 |
except Exception as e:
|
| 312 |
+
st.error(f"❌ Error during visual analysis: {e}")
|
| 313 |
+
print(f"Error: {e}")
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
# ============================================================================
|
| 317 |
+
# QUESTION & ANSWERING
|
| 318 |
+
# ============================================================================
|
| 319 |
|
| 320 |
st.divider()
|
| 321 |
+
st.header("❓ Ask Questions About Document")
|
| 322 |
|
| 323 |
+
# Initialize answering system if not done
|
| 324 |
+
if 'answering_rag' not in st.session_state:
|
| 325 |
+
st.session_state.answering_rag = None
|
| 326 |
|
| 327 |
+
# Create answering system when API key is set
|
| 328 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
| 329 |
+
from rag_system import AnsweringRAG
|
| 330 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 331 |
|
| 332 |
question = st.text_area(
|
| 333 |
+
"Enter your question:",
|
| 334 |
height=100,
|
| 335 |
+
placeholder="What does the document say about...?"
|
| 336 |
)
|
| 337 |
|
| 338 |
+
if st.button("🔍 Search & Generate Answer"):
|
| 339 |
if not st.session_state.api_key_set:
|
| 340 |
+
st.error("❌ Please set OpenAI API key first")
|
| 341 |
elif st.session_state.current_text is None:
|
| 342 |
+
st.error("❌ Please parse a PDF document first")
|
| 343 |
elif not question:
|
| 344 |
+
st.error("❌ Please enter a question")
|
| 345 |
else:
|
| 346 |
try:
|
| 347 |
+
with st.spinner("🔄 Searching document and analyzing..."):
|
| 348 |
+
print(f"\n{'='*70}")
|
| 349 |
+
print(f"QUESTION: {question}")
|
| 350 |
+
print(f"{'='*70}")
|
| 351 |
+
|
| 352 |
+
# Search vector store
|
| 353 |
store = st.session_state.vector_store
|
| 354 |
|
| 355 |
+
# Add documents to store if needed
|
| 356 |
doc_name = st.session_state.current_document or "current_doc"
|
| 357 |
doc_data = {
|
| 358 |
'text': st.session_state.current_text,
|
| 359 |
'images': [],
|
| 360 |
'tables': []
|
| 361 |
}
|
|
|
|
| 362 |
store.add_documents(doc_data, doc_name)
|
| 363 |
|
| 364 |
+
# Search for relevant results
|
| 365 |
search_results = store.search(question, n_results=5)
|
| 366 |
|
| 367 |
+
print(f"\n📊 Search Results Found: {len(search_results)}")
|
| 368 |
+
|
| 369 |
+
# Analyze results and generate answer
|
| 370 |
answering_rag = st.session_state.answering_rag
|
| 371 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 372 |
|
| 373 |
+
# Display answer prominently
|
| 374 |
+
st.success("✅ Analysis complete!")
|
| 375 |
|
| 376 |
+
st.subheader("📝 Answer")
|
| 377 |
|
| 378 |
+
# Show confidence level
|
| 379 |
col1, col2, col3 = st.columns(3)
|
| 380 |
with col1:
|
| 381 |
+
confidence_color = {
|
| 382 |
+
'high': '🟢',
|
| 383 |
+
'medium': '🟡',
|
| 384 |
+
'low': '🔴'
|
| 385 |
+
}.get(result['confidence'], '⚪')
|
| 386 |
+
st.metric("Confidence", f"{confidence_color} {result['confidence'].upper()}")
|
|
|
|
| 387 |
with col2:
|
| 388 |
+
st.metric("Sources Used", result['sources_used'])
|
| 389 |
with col3:
|
| 390 |
if result['sources_used'] > 0:
|
| 391 |
+
st.metric("Avg Relevance", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
|
| 392 |
|
| 393 |
+
# Display the generated answer
|
| 394 |
st.write(result['answer'])
|
| 395 |
|
| 396 |
+
# Show sources
|
| 397 |
+
if st.checkbox("📚 Show Source Documents"):
|
| 398 |
+
st.subheader("Sources Used in Answer")
|
| 399 |
+
for idx, source in enumerate(result['formatted_sources'], 1):
|
| 400 |
relevance = source['relevance']
|
| 401 |
relevance_bar = "█" * int(relevance * 10) + "░" * (10 - int(relevance * 10))
|
| 402 |
+
|
| 403 |
with st.expander(
|
| 404 |
+
f"Source {idx} - {source['type'].upper()} "
|
| 405 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 406 |
):
|
| 407 |
st.write(source['content'])
|
| 408 |
+
|
| 409 |
+
print(f"\n✅ Answer generation complete!")
|
| 410 |
+
|
| 411 |
except Exception as e:
|
| 412 |
+
st.error(f"❌ Error processing question: {e}")
|
| 413 |
+
print(f"Error: {e}")
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
# ============================================================================
|
| 417 |
+
# FOOTER
|
| 418 |
+
# ============================================================================
|
| 419 |
|
| 420 |
st.divider()
|
| 421 |
|
| 422 |
+
col1, col2, col3 = st.columns(3)
|
| 423 |
+
|
| 424 |
+
with col1:
|
| 425 |
+
st.info("📖 **Text Processing**: PyPDF2 extraction with UTF-8 support")
|
| 426 |
+
|
| 427 |
+
with col2:
|
| 428 |
+
st.info("🖼️ **Visual Analysis**: GPT-4o vision for image understanding")
|
| 429 |
+
|
| 430 |
+
with col3:
|
| 431 |
+
st.info("📊 **Vector Storage**: ChromaDB with auto-persist")
|
| 432 |
+
|
| 433 |
st.caption(
|
| 434 |
+
"Multimodal RAG System | "
|
| 435 |
+
"Visual Image Analysis | "
|
| 436 |
+
"Russian Language Support | "
|
| 437 |
+
"Individual Component Summarization"
|
| 438 |
)
|
src/config.py
CHANGED
|
@@ -1,31 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from pathlib import Path
|
| 3 |
|
|
|
|
| 4 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 5 |
-
OPENAI_MODEL = "gpt-4o-mini"
|
| 6 |
-
USE_CACHE = True
|
| 7 |
|
|
|
|
| 8 |
CHROMA_DB_PATH = "./chroma_db"
|
| 9 |
DOCSTORE_PATH = "./docstore"
|
| 10 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 11 |
|
|
|
|
| 12 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 13 |
EMBEDDING_DIM = 768
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
| 19 |
|
|
|
|
| 20 |
LANGUAGE = "russian"
|
| 21 |
|
|
|
|
| 22 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 23 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 24 |
|
|
|
|
| 25 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 26 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 27 |
MAX_PDF_SIZE_MB = 50
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration file for Multimodal RAG LLM System
|
| 3 |
+
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
+
# API Configuration
|
| 8 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 9 |
+
OPENAI_MODEL = "gpt-4o-mini" # Cheaper model variant
|
| 10 |
+
USE_CACHE = True # Enable response caching
|
| 11 |
|
| 12 |
+
# Vector Store Configuration
|
| 13 |
CHROMA_DB_PATH = "./chroma_db"
|
| 14 |
DOCSTORE_PATH = "./docstore"
|
| 15 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 16 |
|
| 17 |
+
# Embedding Model Configuration
|
| 18 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 19 |
EMBEDDING_DIM = 768
|
| 20 |
|
| 21 |
+
# System Configuration
|
| 22 |
+
MAX_CHUNK_SIZE = 500 # Smaller chunks = fewer tokens
|
| 23 |
+
CHUNK_OVERLAP = 50 # Less overlap = fewer chunks
|
| 24 |
+
TEMPERATURE = 0.3 # Lower = faster, cheaper
|
| 25 |
+
MAX_TOKENS = 500 # Limit response size (vs 1500)
|
| 26 |
|
| 27 |
+
# Language Support
|
| 28 |
LANGUAGE = "russian"
|
| 29 |
|
| 30 |
+
# Create necessary directories
|
| 31 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 32 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 33 |
|
| 34 |
+
# PDF Upload Configuration
|
| 35 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 36 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 37 |
MAX_PDF_SIZE_MB = 50
|
| 38 |
|
| 39 |
+
# TOKEN OPTIMIZATION SETTINGS
|
| 40 |
+
BATCH_SEARCH_RESULTS = 3 # Return only top 3 (not 5)
|
| 41 |
+
CACHE_RESPONSES = True # Cache Q&A responses
|
| 42 |
+
SUMMARIZE_FIRST = True # Summarize PDFs once, not per query
|
src/pdf_parser.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import hashlib
|
|
@@ -17,21 +20,29 @@ class PDFParser:
|
|
| 17 |
self.processed_files = self._load_processed_files()
|
| 18 |
self.debug = debug
|
| 19 |
|
|
|
|
| 20 |
self._configure_tesseract()
|
| 21 |
|
| 22 |
if self.debug:
|
| 23 |
-
print("PDFParser initialized")
|
| 24 |
|
| 25 |
def _configure_tesseract(self):
|
|
|
|
| 26 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
pytesseract.get_tesseract_version()
|
| 28 |
-
print("Tesseract configured successfully")
|
| 29 |
except Exception as e:
|
| 30 |
-
print(f"Tesseract configuration warning: {e}")
|
| 31 |
|
| 32 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 33 |
if self.debug:
|
| 34 |
-
print(f"[PDF Parser] {label}")
|
| 35 |
if isinstance(data, dict):
|
| 36 |
for key, val in data.items():
|
| 37 |
print(f" {key}: {val}")
|
|
@@ -43,6 +54,7 @@ class PDFParser:
|
|
| 43 |
print(f" {data}")
|
| 44 |
|
| 45 |
def _load_processed_files(self) -> Dict[str, str]:
|
|
|
|
| 46 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 47 |
try:
|
| 48 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
@@ -52,10 +64,12 @@ class PDFParser:
|
|
| 52 |
return {}
|
| 53 |
|
| 54 |
def _save_processed_files(self):
|
|
|
|
| 55 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 56 |
json.dump(self.processed_files, f, indent=2)
|
| 57 |
|
| 58 |
def _get_file_hash(self, file_path: str) -> str:
|
|
|
|
| 59 |
hash_md5 = hashlib.md5()
|
| 60 |
with open(file_path, "rb") as f:
|
| 61 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
@@ -63,6 +77,7 @@ class PDFParser:
|
|
| 63 |
return hash_md5.hexdigest()
|
| 64 |
|
| 65 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
|
|
|
| 66 |
text = ""
|
| 67 |
try:
|
| 68 |
with open(pdf_path, 'rb') as file:
|
|
@@ -81,31 +96,36 @@ class PDFParser:
|
|
| 81 |
return text
|
| 82 |
|
| 83 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
|
|
|
| 84 |
images_data = []
|
| 85 |
try:
|
| 86 |
self._debug_print("Image Extraction Started", f"File: {pdf_path}")
|
| 87 |
|
| 88 |
images = convert_from_path(pdf_path, dpi=150)
|
| 89 |
-
self._debug_print("PDF to Images", f"Total images: {len(images)}")
|
| 90 |
|
| 91 |
for idx, image in enumerate(images):
|
| 92 |
self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
|
| 93 |
|
|
|
|
| 94 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 95 |
image.save(image_path)
|
| 96 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 97 |
|
| 98 |
-
|
|
|
|
| 99 |
|
| 100 |
try:
|
|
|
|
| 101 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 102 |
|
|
|
|
| 103 |
ocr_text = ocr_text.strip()
|
| 104 |
|
| 105 |
if not ocr_text or len(ocr_text) < 5:
|
| 106 |
-
self._debug_print(f"Image {idx} OCR Result", f"EMPTY or very short ({len(ocr_text)} chars)")
|
| 107 |
else:
|
| 108 |
-
self._debug_print(f"Image {idx} OCR Result", f"Success - {len(ocr_text)} chars: {ocr_text[:150]}")
|
| 109 |
|
| 110 |
except Exception as ocr_error:
|
| 111 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
|
@@ -124,6 +144,7 @@ class PDFParser:
|
|
| 124 |
return images_data
|
| 125 |
|
| 126 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
|
|
|
| 127 |
tables_data = []
|
| 128 |
try:
|
| 129 |
text = self._extract_text_from_pdf(pdf_path)
|
|
@@ -156,22 +177,26 @@ class PDFParser:
|
|
| 156 |
return tables_data
|
| 157 |
|
| 158 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
|
|
|
| 159 |
file_hash = self._get_file_hash(pdf_path)
|
| 160 |
doc_id = Path(pdf_path).stem
|
| 161 |
|
| 162 |
-
self._debug_print("PDF Parsing Started", f"File: {doc_id}")
|
| 163 |
|
|
|
|
| 164 |
if doc_id in self.processed_files:
|
| 165 |
if self.processed_files[doc_id] == file_hash:
|
| 166 |
-
self._debug_print("Status", f"File {doc_id} already processed")
|
| 167 |
return self._load_extracted_data(doc_id)
|
| 168 |
|
| 169 |
-
print(f"Processing PDF: {doc_id}")
|
| 170 |
|
|
|
|
| 171 |
text = self._extract_text_from_pdf(pdf_path)
|
| 172 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 173 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 174 |
|
|
|
|
| 175 |
self._debug_print("Extraction Summary", {
|
| 176 |
'text_length': len(text),
|
| 177 |
'images_count': len(images),
|
|
@@ -179,14 +204,17 @@ class PDFParser:
|
|
| 179 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 180 |
})
|
| 181 |
|
|
|
|
| 182 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 183 |
|
|
|
|
| 184 |
self.processed_files[doc_id] = file_hash
|
| 185 |
self._save_processed_files()
|
| 186 |
|
| 187 |
return text, images, tables
|
| 188 |
|
| 189 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
|
|
|
| 190 |
data = {
|
| 191 |
'text': text,
|
| 192 |
'images': images,
|
|
@@ -199,6 +227,7 @@ class PDFParser:
|
|
| 199 |
self._debug_print("Data Saved", str(data_path))
|
| 200 |
|
| 201 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
|
|
|
| 202 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 203 |
try:
|
| 204 |
with open(data_path, 'r', encoding='utf-8') as f:
|
|
@@ -208,6 +237,7 @@ class PDFParser:
|
|
| 208 |
return "", [], []
|
| 209 |
|
| 210 |
def get_all_documents(self) -> Dict:
|
|
|
|
| 211 |
all_docs = {}
|
| 212 |
for json_file in self.docstore_path.glob("*_data.json"):
|
| 213 |
doc_id = json_file.stem.replace("_data", "")
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Parser Module with FIXED Russian OCR support
|
| 3 |
+
"""
|
| 4 |
import os
|
| 5 |
import json
|
| 6 |
import hashlib
|
|
|
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
|
| 23 |
+
# Configure Tesseract for Russian + English
|
| 24 |
self._configure_tesseract()
|
| 25 |
|
| 26 |
if self.debug:
|
| 27 |
+
print("✅ PDFParser initialized with Russian OCR support")
|
| 28 |
|
| 29 |
def _configure_tesseract(self):
|
| 30 |
+
"""Configure Tesseract with proper paths and language support"""
|
| 31 |
try:
|
| 32 |
+
# Windows specific path
|
| 33 |
+
if os.name == 'nt':
|
| 34 |
+
pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 35 |
+
|
| 36 |
+
# Test Tesseract
|
| 37 |
pytesseract.get_tesseract_version()
|
| 38 |
+
print("✅ Tesseract configured successfully")
|
| 39 |
except Exception as e:
|
| 40 |
+
print(f"⚠️ Tesseract configuration warning: {e}")
|
| 41 |
|
| 42 |
def _debug_print(self, label: str, data: any):
|
| 43 |
+
"""Print debug information"""
|
| 44 |
if self.debug:
|
| 45 |
+
print(f"\n🔍 [PDF Parser] {label}")
|
| 46 |
if isinstance(data, dict):
|
| 47 |
for key, val in data.items():
|
| 48 |
print(f" {key}: {val}")
|
|
|
|
| 54 |
print(f" {data}")
|
| 55 |
|
| 56 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 57 |
+
"""Load list of already processed files with their hashes"""
|
| 58 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 59 |
try:
|
| 60 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
|
|
| 64 |
return {}
|
| 65 |
|
| 66 |
def _save_processed_files(self):
|
| 67 |
+
"""Save processed files list to disk"""
|
| 68 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 69 |
json.dump(self.processed_files, f, indent=2)
|
| 70 |
|
| 71 |
def _get_file_hash(self, file_path: str) -> str:
|
| 72 |
+
"""Generate hash of file to detect changes"""
|
| 73 |
hash_md5 = hashlib.md5()
|
| 74 |
with open(file_path, "rb") as f:
|
| 75 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
|
|
| 77 |
return hash_md5.hexdigest()
|
| 78 |
|
| 79 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
| 80 |
+
"""Extract text from PDF using PyPDF2"""
|
| 81 |
text = ""
|
| 82 |
try:
|
| 83 |
with open(pdf_path, 'rb') as file:
|
|
|
|
| 96 |
return text
|
| 97 |
|
| 98 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 99 |
+
"""Extract images from PDF pages with Russian OCR support"""
|
| 100 |
images_data = []
|
| 101 |
try:
|
| 102 |
self._debug_print("Image Extraction Started", f"File: {pdf_path}")
|
| 103 |
|
| 104 |
images = convert_from_path(pdf_path, dpi=150)
|
| 105 |
+
self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
|
| 106 |
|
| 107 |
for idx, image in enumerate(images):
|
| 108 |
self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
|
| 109 |
|
| 110 |
+
# Save image
|
| 111 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 112 |
image.save(image_path)
|
| 113 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 114 |
|
| 115 |
+
# Extract text using OCR with Russian support
|
| 116 |
+
self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
|
| 117 |
|
| 118 |
try:
|
| 119 |
+
# CRITICAL: Use 'rus+eng' for Russian + English support
|
| 120 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 121 |
|
| 122 |
+
# Clean up text
|
| 123 |
ocr_text = ocr_text.strip()
|
| 124 |
|
| 125 |
if not ocr_text or len(ocr_text) < 5:
|
| 126 |
+
self._debug_print(f"Image {idx} OCR Result", f"⚠️ EMPTY or very short ({len(ocr_text)} chars)")
|
| 127 |
else:
|
| 128 |
+
self._debug_print(f"Image {idx} OCR Result", f"✅ Success - {len(ocr_text)} chars: {ocr_text[:150]}")
|
| 129 |
|
| 130 |
except Exception as ocr_error:
|
| 131 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
|
|
|
| 144 |
return images_data
|
| 145 |
|
| 146 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 147 |
+
"""Extract table content from PDF"""
|
| 148 |
tables_data = []
|
| 149 |
try:
|
| 150 |
text = self._extract_text_from_pdf(pdf_path)
|
|
|
|
| 177 |
return tables_data
|
| 178 |
|
| 179 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 180 |
+
"""Parse PDF and extract text, images, and tables with debug output"""
|
| 181 |
file_hash = self._get_file_hash(pdf_path)
|
| 182 |
doc_id = Path(pdf_path).stem
|
| 183 |
|
| 184 |
+
self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
|
| 185 |
|
| 186 |
+
# Check if file was already processed
|
| 187 |
if doc_id in self.processed_files:
|
| 188 |
if self.processed_files[doc_id] == file_hash:
|
| 189 |
+
self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
|
| 190 |
return self._load_extracted_data(doc_id)
|
| 191 |
|
| 192 |
+
print(f"\n📄 Processing PDF: {doc_id}")
|
| 193 |
|
| 194 |
+
# Extract content
|
| 195 |
text = self._extract_text_from_pdf(pdf_path)
|
| 196 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 197 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 198 |
|
| 199 |
+
# Summary
|
| 200 |
self._debug_print("Extraction Summary", {
|
| 201 |
'text_length': len(text),
|
| 202 |
'images_count': len(images),
|
|
|
|
| 204 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 205 |
})
|
| 206 |
|
| 207 |
+
# Save extracted data
|
| 208 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 209 |
|
| 210 |
+
# Update processed files log
|
| 211 |
self.processed_files[doc_id] = file_hash
|
| 212 |
self._save_processed_files()
|
| 213 |
|
| 214 |
return text, images, tables
|
| 215 |
|
| 216 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
| 217 |
+
"""Save extracted data to docstore"""
|
| 218 |
data = {
|
| 219 |
'text': text,
|
| 220 |
'images': images,
|
|
|
|
| 227 |
self._debug_print("Data Saved", str(data_path))
|
| 228 |
|
| 229 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 230 |
+
"""Load previously extracted data from docstore"""
|
| 231 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 232 |
try:
|
| 233 |
with open(data_path, 'r', encoding='utf-8') as f:
|
|
|
|
| 237 |
return "", [], []
|
| 238 |
|
| 239 |
def get_all_documents(self) -> Dict:
|
| 240 |
+
"""Load all processed documents from docstore"""
|
| 241 |
all_docs = {}
|
| 242 |
for json_file in self.docstore_path.glob("*_data.json"):
|
| 243 |
doc_id = json_file.stem.replace("_data", "")
|
src/rag_system.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import List, Dict
|
| 2 |
from langchain_openai import ChatOpenAI
|
| 3 |
from langchain_core.messages import HumanMessage, SystemMessage
|
|
@@ -11,14 +16,21 @@ from config import (
|
|
| 11 |
|
| 12 |
|
| 13 |
class VisualMultimodalRAG:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 16 |
api_key = api_key or OPENAI_API_KEY
|
| 17 |
self.debug = debug
|
| 18 |
|
| 19 |
-
|
| 20 |
self.llm = ChatOpenAI(
|
| 21 |
-
model_name="gpt-4o-mini",
|
| 22 |
api_key=api_key,
|
| 23 |
temperature=TEMPERATURE,
|
| 24 |
max_tokens=MAX_TOKENS,
|
|
@@ -28,11 +40,12 @@ class VisualMultimodalRAG:
|
|
| 28 |
self.visual_summaries_log = []
|
| 29 |
|
| 30 |
if self.debug:
|
| 31 |
-
print("VisualMultimodalRAG initialized")
|
| 32 |
|
| 33 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 34 |
if self.debug:
|
| 35 |
-
print(f"DEBUG [{label}]:")
|
| 36 |
if isinstance(data, (list, dict)):
|
| 37 |
print(f" Type: {type(data).__name__}")
|
| 38 |
print(f" Content: {str(data)[:300]}...")
|
|
@@ -40,6 +53,7 @@ class VisualMultimodalRAG:
|
|
| 40 |
print(f" {data}")
|
| 41 |
|
| 42 |
def _image_to_base64(self, image_path: str) -> str:
|
|
|
|
| 43 |
try:
|
| 44 |
with open(image_path, 'rb') as image_file:
|
| 45 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
@@ -49,14 +63,28 @@ class VisualMultimodalRAG:
|
|
| 49 |
return None
|
| 50 |
|
| 51 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
if not os.path.exists(image_path):
|
| 53 |
return f"[Image {image_idx}: File not found - {image_path}]"
|
| 54 |
|
| 55 |
try:
|
|
|
|
| 56 |
image_base64 = self._image_to_base64(image_path)
|
| 57 |
if not image_base64:
|
| 58 |
return f"[Image {image_idx}: Could not convert to base64]"
|
| 59 |
|
|
|
|
| 60 |
file_ext = Path(image_path).suffix.lower()
|
| 61 |
media_type_map = {
|
| 62 |
'.jpg': 'image/jpeg',
|
|
@@ -67,8 +95,9 @@ class VisualMultimodalRAG:
|
|
| 67 |
}
|
| 68 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 69 |
|
| 70 |
-
print(f"Analyzing image {image_idx}...")
|
| 71 |
|
|
|
|
| 72 |
message = HumanMessage(
|
| 73 |
content=[
|
| 74 |
{
|
|
@@ -79,44 +108,52 @@ class VisualMultimodalRAG:
|
|
| 79 |
},
|
| 80 |
{
|
| 81 |
"type": "text",
|
| 82 |
-
"text": f"""
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
| 90 |
Analysis:"""
|
| 91 |
}
|
| 92 |
],
|
| 93 |
)
|
| 94 |
|
|
|
|
| 95 |
response = self.llm.invoke([message])
|
| 96 |
analysis = response.content.strip()
|
| 97 |
|
| 98 |
if self.debug:
|
| 99 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 100 |
|
| 101 |
-
print(f"Image {image_idx} analyzed successfully")
|
| 102 |
return analysis
|
| 103 |
|
| 104 |
except Exception as e:
|
| 105 |
error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
|
| 106 |
-
print(f"Error analyzing image {image_idx}: {e}")
|
| 107 |
return error_msg
|
| 108 |
|
| 109 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
visual_analyses = []
|
| 112 |
|
| 113 |
for idx, image in enumerate(images):
|
| 114 |
image_path = image.get('path', '')
|
| 115 |
|
| 116 |
if not image_path:
|
| 117 |
-
print(f"Image {idx}: No path provided")
|
| 118 |
continue
|
| 119 |
|
|
|
|
| 120 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 121 |
|
| 122 |
visual_analyses.append({
|
|
@@ -124,12 +161,15 @@ Analysis:"""
|
|
| 124 |
'image_index': idx,
|
| 125 |
'image_path': image_path,
|
| 126 |
'visual_analysis': visual_analysis,
|
| 127 |
-
'ocr_text': image.get('ocr_text', '')
|
| 128 |
})
|
| 129 |
|
| 130 |
return visual_analyses
|
| 131 |
|
| 132 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
| 133 |
chunks = []
|
| 134 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
| 135 |
|
|
@@ -141,12 +181,12 @@ Analysis:"""
|
|
| 141 |
|
| 142 |
try:
|
| 143 |
prompt = f"""Summarize this text chunk in {self.language}.
|
| 144 |
-
|
| 145 |
|
| 146 |
Text Chunk:
|
| 147 |
{chunk}
|
| 148 |
|
| 149 |
-
Summary:"""
|
| 150 |
|
| 151 |
message = HumanMessage(content=prompt)
|
| 152 |
response = self.llm.invoke([message])
|
|
@@ -169,6 +209,9 @@ Summary:"""
|
|
| 169 |
return chunks
|
| 170 |
|
| 171 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
| 172 |
summaries = []
|
| 173 |
|
| 174 |
for idx, table in enumerate(tables):
|
|
@@ -179,12 +222,12 @@ Summary:"""
|
|
| 179 |
|
| 180 |
try:
|
| 181 |
prompt = f"""Analyze and summarize this table/structured data in {self.language}.
|
| 182 |
-
Extract key insights, row/column meanings, and important figures.
|
| 183 |
|
| 184 |
Table Content:
|
| 185 |
{table_content}
|
| 186 |
|
| 187 |
-
Summary:"""
|
| 188 |
|
| 189 |
message = HumanMessage(content=prompt)
|
| 190 |
response = self.llm.invoke([message])
|
|
@@ -214,9 +257,13 @@ Summary:"""
|
|
| 214 |
vector_store,
|
| 215 |
doc_id: str
|
| 216 |
) -> Dict:
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
|
| 219 |
-
|
| 220 |
|
| 221 |
results = {
|
| 222 |
'doc_id': doc_id,
|
|
@@ -226,13 +273,14 @@ Summary:"""
|
|
| 226 |
'total_stored': 0
|
| 227 |
}
|
| 228 |
|
| 229 |
-
|
| 230 |
-
print(f"VISUAL IMAGE ANALYSIS ({len(images)} total)")
|
| 231 |
-
|
| 232 |
|
| 233 |
image_analyses = self.analyze_images_visually(images)
|
| 234 |
results['image_visual_analyses'] = image_analyses
|
| 235 |
|
|
|
|
| 236 |
image_docs = {
|
| 237 |
'text': ' | '.join([
|
| 238 |
f"Image {a['image_index']}: {a['visual_analysis']}"
|
|
@@ -243,7 +291,7 @@ Summary:"""
|
|
| 243 |
}
|
| 244 |
|
| 245 |
for analysis in image_analyses:
|
| 246 |
-
print(f"
|
| 247 |
print(f" Path: {analysis['image_path']}")
|
| 248 |
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
| 249 |
|
|
@@ -254,11 +302,13 @@ Summary:"""
|
|
| 254 |
f"{doc_id}_images_visual"
|
| 255 |
)
|
| 256 |
results['total_stored'] += len(image_analyses)
|
| 257 |
-
print(f" Stored {len(image_analyses)} image visual analyses")
|
| 258 |
except Exception as e:
|
| 259 |
-
print(f" Error storing image analyses: {e}")
|
| 260 |
|
| 261 |
-
|
|
|
|
|
|
|
| 262 |
|
| 263 |
text_summaries = self.summarize_text_chunks(text)
|
| 264 |
results['text_summaries'] = text_summaries
|
|
@@ -271,7 +321,7 @@ Summary:"""
|
|
| 271 |
}
|
| 272 |
|
| 273 |
for summary in text_summaries:
|
| 274 |
-
print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
|
| 275 |
|
| 276 |
if text_summaries:
|
| 277 |
try:
|
|
@@ -280,11 +330,13 @@ Summary:"""
|
|
| 280 |
f"{doc_id}_text_chunks"
|
| 281 |
)
|
| 282 |
results['total_stored'] += len(text_summaries)
|
| 283 |
-
print(f" Stored {len(text_summaries)} text chunk summaries")
|
| 284 |
except Exception as e:
|
| 285 |
-
print(f" Error storing text summaries: {e}")
|
| 286 |
|
| 287 |
-
|
|
|
|
|
|
|
| 288 |
|
| 289 |
table_summaries = self.summarize_tables(tables)
|
| 290 |
results['table_summaries'] = table_summaries
|
|
@@ -297,7 +349,7 @@ Summary:"""
|
|
| 297 |
}
|
| 298 |
|
| 299 |
for summary in table_summaries:
|
| 300 |
-
print(f"
|
| 301 |
|
| 302 |
if table_summaries:
|
| 303 |
try:
|
|
@@ -306,15 +358,19 @@ Summary:"""
|
|
| 306 |
f"{doc_id}_tables"
|
| 307 |
)
|
| 308 |
results['total_stored'] += len(table_summaries)
|
| 309 |
-
print(f" Stored {len(table_summaries)} table summaries")
|
| 310 |
except Exception as e:
|
| 311 |
-
print(f" Error storing table summaries: {e}")
|
| 312 |
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
| 314 |
print(f" Images analyzed visually & stored: {len(image_analyses)}")
|
| 315 |
print(f" Text chunks summarized & stored: {len(text_summaries)}")
|
| 316 |
print(f" Tables summarized & stored: {len(table_summaries)}")
|
| 317 |
print(f" Total items stored in vector: {results['total_stored']}")
|
|
|
|
| 318 |
|
| 319 |
self.visual_summaries_log.append(results)
|
| 320 |
return results
|
|
@@ -335,13 +391,19 @@ Summary:"""
|
|
| 335 |
|
| 336 |
|
| 337 |
class AnsweringRAG:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 340 |
api_key = api_key or OPENAI_API_KEY
|
| 341 |
self.debug = debug
|
| 342 |
|
| 343 |
self.llm = ChatOpenAI(
|
| 344 |
-
model_name="gpt-4o-mini",
|
| 345 |
api_key=api_key,
|
| 346 |
temperature=TEMPERATURE,
|
| 347 |
max_tokens=MAX_TOKENS,
|
|
@@ -351,11 +413,12 @@ class AnsweringRAG:
|
|
| 351 |
self.answer_log = []
|
| 352 |
|
| 353 |
if self.debug:
|
| 354 |
-
print("AnsweringRAG initialized ")
|
| 355 |
|
| 356 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 357 |
if self.debug:
|
| 358 |
-
print(f" DEBUG [{label}]:")
|
| 359 |
if isinstance(data, (list, dict)):
|
| 360 |
print(f" Type: {type(data).__name__}")
|
| 361 |
print(f" Content: {str(data)[:300]}...")
|
|
@@ -367,17 +430,35 @@ class AnsweringRAG:
|
|
| 367 |
question: str,
|
| 368 |
search_results: List[Dict]
|
| 369 |
) -> Dict:
|
| 370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
|
|
|
|
| 372 |
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
|
|
|
| 373 |
|
| 374 |
-
print(f"Question: {question}")
|
| 375 |
-
print(f"Search Results Found: {len(search_results)}")
|
| 376 |
|
|
|
|
| 377 |
if not search_results:
|
| 378 |
-
print(f"No search results found!")
|
| 379 |
-
answer = f"""
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
result = {
|
| 383 |
'question': question,
|
|
@@ -389,6 +470,7 @@ class AnsweringRAG:
|
|
| 389 |
self.answer_log.append(result)
|
| 390 |
return result
|
| 391 |
|
|
|
|
| 392 |
context_parts = []
|
| 393 |
for idx, result in enumerate(search_results, 1):
|
| 394 |
content = result.get('content', '')
|
|
@@ -405,6 +487,7 @@ class AnsweringRAG:
|
|
| 405 |
|
| 406 |
self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
|
| 407 |
|
|
|
|
| 408 |
analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
|
| 409 |
|
| 410 |
USER QUESTION:
|
|
@@ -420,20 +503,23 @@ INSTRUCTIONS:
|
|
| 420 |
4. If the content doesn't fully answer the question, explain what information is available
|
| 421 |
5. Be specific and cite the content when relevant
|
| 422 |
6. Structure your answer clearly with key points
|
|
|
|
| 423 |
ANSWER:"""
|
| 424 |
|
| 425 |
-
print(f"Analyzing search results...")
|
| 426 |
print(f" Context size: {len(full_context)} characters")
|
| 427 |
print(f" Sources: {len(search_results)}")
|
| 428 |
|
| 429 |
try:
|
|
|
|
| 430 |
message = HumanMessage(content=analysis_prompt)
|
| 431 |
response = self.llm.invoke([message])
|
| 432 |
answer = response.content.strip()
|
| 433 |
|
|
|
|
| 434 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 435 |
|
| 436 |
-
print(f" Answer generated successfully")
|
| 437 |
print(f" Confidence: {confidence}")
|
| 438 |
print(f" Answer length: {len(answer)} characters")
|
| 439 |
|
|
@@ -449,7 +535,7 @@ ANSWER:"""
|
|
| 449 |
return result
|
| 450 |
|
| 451 |
except Exception as e:
|
| 452 |
-
print(f" Error generating answer: {e}")
|
| 453 |
answer = f"I encountered an error while analyzing the search results. Please try again."
|
| 454 |
|
| 455 |
result = {
|
|
@@ -465,14 +551,18 @@ ANSWER:"""
|
|
| 465 |
return result
|
| 466 |
|
| 467 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
|
|
|
| 468 |
answer_length = len(answer)
|
| 469 |
|
|
|
|
| 470 |
if sources_count >= 3 and answer_length > 500:
|
| 471 |
return "high"
|
| 472 |
|
|
|
|
| 473 |
elif sources_count >= 2 and answer_length > 200:
|
| 474 |
return "medium"
|
| 475 |
|
|
|
|
| 476 |
else:
|
| 477 |
return "low"
|
| 478 |
|
|
@@ -481,9 +571,14 @@ ANSWER:"""
|
|
| 481 |
question: str,
|
| 482 |
search_results: List[Dict]
|
| 483 |
) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
|
| 485 |
result = self.analyze_and_answer(question, search_results)
|
| 486 |
|
|
|
|
| 487 |
formatted_sources = []
|
| 488 |
for idx, source in enumerate(result['search_results'], 1):
|
| 489 |
formatted_sources.append({
|
|
@@ -497,18 +592,25 @@ ANSWER:"""
|
|
| 497 |
return result
|
| 498 |
|
| 499 |
def get_answer_log(self) -> List[Dict]:
|
|
|
|
| 500 |
return self.answer_log
|
| 501 |
|
| 502 |
def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
|
|
|
|
| 503 |
|
|
|
|
| 504 |
print(f"ANSWER TO: {result['question']}")
|
|
|
|
| 505 |
|
| 506 |
-
print(f"ANSWER (Confidence: {result['confidence'].upper()}):")
|
|
|
|
| 507 |
print(result['answer'])
|
|
|
|
| 508 |
|
| 509 |
if result.get('formatted_sources'):
|
| 510 |
-
print(f"SOURCES USED ({len(result['formatted_sources'])} total):")
|
| 511 |
for source in result['formatted_sources']:
|
| 512 |
print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
|
| 513 |
print(f"{source['content'][:max_source_length]}...")
|
| 514 |
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enhanced RAG System - Visual Image Analysis
|
| 3 |
+
Sends base64 images directly to GPT-4o for visual analysis (not just OCR)
|
| 4 |
+
Then stores results in vector store
|
| 5 |
+
"""
|
| 6 |
from typing import List, Dict
|
| 7 |
from langchain_openai import ChatOpenAI
|
| 8 |
from langchain_core.messages import HumanMessage, SystemMessage
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class VisualMultimodalRAG:
|
| 19 |
+
"""
|
| 20 |
+
RAG system that:
|
| 21 |
+
1. Sends images as base64 to GPT-4o for visual analysis
|
| 22 |
+
2. Gets detailed visual descriptions and insights
|
| 23 |
+
3. Stores visual analysis in vector store
|
| 24 |
+
4. Enables image-based semantic search
|
| 25 |
+
"""
|
| 26 |
|
| 27 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 28 |
api_key = api_key or OPENAI_API_KEY
|
| 29 |
self.debug = debug
|
| 30 |
|
| 31 |
+
# Use gpt-4o for vision capabilities
|
| 32 |
self.llm = ChatOpenAI(
|
| 33 |
+
model_name="gpt-4o-mini", # CRITICAL: gpt-4o has vision
|
| 34 |
api_key=api_key,
|
| 35 |
temperature=TEMPERATURE,
|
| 36 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 40 |
self.visual_summaries_log = []
|
| 41 |
|
| 42 |
if self.debug:
|
| 43 |
+
print("✅ VisualMultimodalRAG initialized with gpt-4o (vision model)")
|
| 44 |
|
| 45 |
def _debug_print(self, label: str, data: any):
|
| 46 |
+
"""Print debug information"""
|
| 47 |
if self.debug:
|
| 48 |
+
print(f"\n🔍 DEBUG [{label}]:")
|
| 49 |
if isinstance(data, (list, dict)):
|
| 50 |
print(f" Type: {type(data).__name__}")
|
| 51 |
print(f" Content: {str(data)[:300]}...")
|
|
|
|
| 53 |
print(f" {data}")
|
| 54 |
|
| 55 |
def _image_to_base64(self, image_path: str) -> str:
|
| 56 |
+
"""Convert image file to base64 string"""
|
| 57 |
try:
|
| 58 |
with open(image_path, 'rb') as image_file:
|
| 59 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
|
|
| 63 |
return None
|
| 64 |
|
| 65 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
| 66 |
+
"""
|
| 67 |
+
Send actual image (base64) to gpt-4o for visual analysis
|
| 68 |
+
Returns detailed visual analysis/description
|
| 69 |
+
|
| 70 |
+
gpt-4o can see:
|
| 71 |
+
- Charts, graphs, diagrams
|
| 72 |
+
- Tables and structured data
|
| 73 |
+
- Photos and drawings
|
| 74 |
+
- Handwritten text
|
| 75 |
+
- Screenshots
|
| 76 |
+
- Any visual content
|
| 77 |
+
"""
|
| 78 |
if not os.path.exists(image_path):
|
| 79 |
return f"[Image {image_idx}: File not found - {image_path}]"
|
| 80 |
|
| 81 |
try:
|
| 82 |
+
# Convert image to base64
|
| 83 |
image_base64 = self._image_to_base64(image_path)
|
| 84 |
if not image_base64:
|
| 85 |
return f"[Image {image_idx}: Could not convert to base64]"
|
| 86 |
|
| 87 |
+
# Determine image type
|
| 88 |
file_ext = Path(image_path).suffix.lower()
|
| 89 |
media_type_map = {
|
| 90 |
'.jpg': 'image/jpeg',
|
|
|
|
| 95 |
}
|
| 96 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 97 |
|
| 98 |
+
print(f"🔍 Analyzing image {image_idx} visually (as {media_type})...")
|
| 99 |
|
| 100 |
+
# Create message with image
|
| 101 |
message = HumanMessage(
|
| 102 |
content=[
|
| 103 |
{
|
|
|
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"type": "text",
|
| 111 |
+
"text": f"""Analyze this image in detail in {self.language}.
|
| 112 |
+
|
| 113 |
+
Provide a comprehensive visual analysis including:
|
| 114 |
+
1. **What you see** - Main objects, elements, structure
|
| 115 |
+
2. **Data/Content** - Any numbers, text, charts, graphs
|
| 116 |
+
3. **Purpose** - What this image is showing or representing
|
| 117 |
+
4. **Key insights** - Important patterns, trends, or information
|
| 118 |
+
5. **Connections** - How this relates to document content
|
| 119 |
+
|
| 120 |
+
Be specific and detailed. Focus on visual information that cannot be extracted from text alone.
|
| 121 |
+
|
| 122 |
Analysis:"""
|
| 123 |
}
|
| 124 |
],
|
| 125 |
)
|
| 126 |
|
| 127 |
+
# Call gpt-4o with vision
|
| 128 |
response = self.llm.invoke([message])
|
| 129 |
analysis = response.content.strip()
|
| 130 |
|
| 131 |
if self.debug:
|
| 132 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 133 |
|
| 134 |
+
print(f"✅ Image {image_idx} analyzed successfully")
|
| 135 |
return analysis
|
| 136 |
|
| 137 |
except Exception as e:
|
| 138 |
error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
|
| 139 |
+
print(f"❌ Error analyzing image {image_idx}: {e}")
|
| 140 |
return error_msg
|
| 141 |
|
| 142 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 143 |
+
"""
|
| 144 |
+
Analyze each image visually using gpt-4o vision
|
| 145 |
+
Returns list of {image_index, visual_analysis, type}
|
| 146 |
+
"""
|
| 147 |
visual_analyses = []
|
| 148 |
|
| 149 |
for idx, image in enumerate(images):
|
| 150 |
image_path = image.get('path', '')
|
| 151 |
|
| 152 |
if not image_path:
|
| 153 |
+
print(f"⚠️ Image {idx}: No path provided")
|
| 154 |
continue
|
| 155 |
|
| 156 |
+
# Analyze image visually (not just OCR)
|
| 157 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 158 |
|
| 159 |
visual_analyses.append({
|
|
|
|
| 161 |
'image_index': idx,
|
| 162 |
'image_path': image_path,
|
| 163 |
'visual_analysis': visual_analysis,
|
| 164 |
+
'ocr_text': image.get('ocr_text', '') # Keep OCR as backup
|
| 165 |
})
|
| 166 |
|
| 167 |
return visual_analyses
|
| 168 |
|
| 169 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
| 170 |
+
"""
|
| 171 |
+
Chunk text and summarize each chunk individually
|
| 172 |
+
"""
|
| 173 |
chunks = []
|
| 174 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
| 175 |
|
|
|
|
| 181 |
|
| 182 |
try:
|
| 183 |
prompt = f"""Summarize this text chunk in {self.language}.
|
| 184 |
+
Keep it concise. Extract key points, facts, and main ideas.
|
| 185 |
|
| 186 |
Text Chunk:
|
| 187 |
{chunk}
|
| 188 |
|
| 189 |
+
Summary (2-3 sentences maximum):"""
|
| 190 |
|
| 191 |
message = HumanMessage(content=prompt)
|
| 192 |
response = self.llm.invoke([message])
|
|
|
|
| 209 |
return chunks
|
| 210 |
|
| 211 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
| 212 |
+
"""
|
| 213 |
+
Summarize each table individually
|
| 214 |
+
"""
|
| 215 |
summaries = []
|
| 216 |
|
| 217 |
for idx, table in enumerate(tables):
|
|
|
|
| 222 |
|
| 223 |
try:
|
| 224 |
prompt = f"""Analyze and summarize this table/structured data in {self.language}.
|
| 225 |
+
Extract key insights, row/column meanings, and important figures.
|
| 226 |
|
| 227 |
Table Content:
|
| 228 |
{table_content}
|
| 229 |
|
| 230 |
+
Summary (2-3 sentences maximum):"""
|
| 231 |
|
| 232 |
message = HumanMessage(content=prompt)
|
| 233 |
response = self.llm.invoke([message])
|
|
|
|
| 257 |
vector_store,
|
| 258 |
doc_id: str
|
| 259 |
) -> Dict:
|
| 260 |
+
"""
|
| 261 |
+
Main function: Analyze all components visually and store in vector store
|
| 262 |
+
Images are analyzed using gpt-4o vision (not just OCR)
|
| 263 |
+
"""
|
| 264 |
+
print(f"\n{'='*70}")
|
| 265 |
print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
|
| 266 |
+
print(f"{'='*70}")
|
| 267 |
|
| 268 |
results = {
|
| 269 |
'doc_id': doc_id,
|
|
|
|
| 273 |
'total_stored': 0
|
| 274 |
}
|
| 275 |
|
| 276 |
+
# 1. Analyze images VISUALLY using gpt-4o
|
| 277 |
+
print(f"\n🖼️ VISUAL IMAGE ANALYSIS (gpt-4o vision) ({len(images)} total)")
|
| 278 |
+
print(f"{'─'*70}")
|
| 279 |
|
| 280 |
image_analyses = self.analyze_images_visually(images)
|
| 281 |
results['image_visual_analyses'] = image_analyses
|
| 282 |
|
| 283 |
+
# Store each image analysis in vector store
|
| 284 |
image_docs = {
|
| 285 |
'text': ' | '.join([
|
| 286 |
f"Image {a['image_index']}: {a['visual_analysis']}"
|
|
|
|
| 291 |
}
|
| 292 |
|
| 293 |
for analysis in image_analyses:
|
| 294 |
+
print(f" ✅ Image {analysis['image_index']} (visual analysis)")
|
| 295 |
print(f" Path: {analysis['image_path']}")
|
| 296 |
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
| 297 |
|
|
|
|
| 302 |
f"{doc_id}_images_visual"
|
| 303 |
)
|
| 304 |
results['total_stored'] += len(image_analyses)
|
| 305 |
+
print(f"✅ Stored {len(image_analyses)} image visual analyses")
|
| 306 |
except Exception as e:
|
| 307 |
+
print(f"❌ Error storing image analyses: {e}")
|
| 308 |
|
| 309 |
+
# 2. Summarize and store text chunks
|
| 310 |
+
print(f"\n📝 TEXT CHUNK SUMMARIZATION")
|
| 311 |
+
print(f"{'─'*70}")
|
| 312 |
|
| 313 |
text_summaries = self.summarize_text_chunks(text)
|
| 314 |
results['text_summaries'] = text_summaries
|
|
|
|
| 321 |
}
|
| 322 |
|
| 323 |
for summary in text_summaries:
|
| 324 |
+
print(f" ✅ Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
|
| 325 |
|
| 326 |
if text_summaries:
|
| 327 |
try:
|
|
|
|
| 330 |
f"{doc_id}_text_chunks"
|
| 331 |
)
|
| 332 |
results['total_stored'] += len(text_summaries)
|
| 333 |
+
print(f"✅ Stored {len(text_summaries)} text chunk summaries")
|
| 334 |
except Exception as e:
|
| 335 |
+
print(f"❌ Error storing text summaries: {e}")
|
| 336 |
|
| 337 |
+
# 3. Summarize and store tables
|
| 338 |
+
print(f"\n📋 TABLE SUMMARIZATION ({len(tables)} total)")
|
| 339 |
+
print(f"{'─'*70}")
|
| 340 |
|
| 341 |
table_summaries = self.summarize_tables(tables)
|
| 342 |
results['table_summaries'] = table_summaries
|
|
|
|
| 349 |
}
|
| 350 |
|
| 351 |
for summary in table_summaries:
|
| 352 |
+
print(f" ✅ Table {summary['table_index']}: {summary['summary'][:50]}...")
|
| 353 |
|
| 354 |
if table_summaries:
|
| 355 |
try:
|
|
|
|
| 358 |
f"{doc_id}_tables"
|
| 359 |
)
|
| 360 |
results['total_stored'] += len(table_summaries)
|
| 361 |
+
print(f"✅ Stored {len(table_summaries)} table summaries")
|
| 362 |
except Exception as e:
|
| 363 |
+
print(f"❌ Error storing table summaries: {e}")
|
| 364 |
|
| 365 |
+
# 4. Summary statistics
|
| 366 |
+
print(f"\n{'='*70}")
|
| 367 |
+
print(f"📊 STORAGE SUMMARY")
|
| 368 |
+
print(f"{'='*70}")
|
| 369 |
print(f" Images analyzed visually & stored: {len(image_analyses)}")
|
| 370 |
print(f" Text chunks summarized & stored: {len(text_summaries)}")
|
| 371 |
print(f" Tables summarized & stored: {len(table_summaries)}")
|
| 372 |
print(f" Total items stored in vector: {results['total_stored']}")
|
| 373 |
+
print(f"{'='*70}")
|
| 374 |
|
| 375 |
self.visual_summaries_log.append(results)
|
| 376 |
return results
|
|
|
|
| 391 |
|
| 392 |
|
| 393 |
class AnsweringRAG:
|
| 394 |
+
"""
|
| 395 |
+
RAG system that:
|
| 396 |
+
1. Searches vector store for relevant content
|
| 397 |
+
2. ANALYZES search results
|
| 398 |
+
3. Generates intelligent answers based on context
|
| 399 |
+
"""
|
| 400 |
|
| 401 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 402 |
api_key = api_key or OPENAI_API_KEY
|
| 403 |
self.debug = debug
|
| 404 |
|
| 405 |
self.llm = ChatOpenAI(
|
| 406 |
+
model_name="gpt-4o-mini", # Use gpt-4o for better understanding
|
| 407 |
api_key=api_key,
|
| 408 |
temperature=TEMPERATURE,
|
| 409 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 413 |
self.answer_log = []
|
| 414 |
|
| 415 |
if self.debug:
|
| 416 |
+
print("✅ AnsweringRAG initialized with answer generation")
|
| 417 |
|
| 418 |
def _debug_print(self, label: str, data: any):
|
| 419 |
+
"""Print debug information"""
|
| 420 |
if self.debug:
|
| 421 |
+
print(f"\n🔍 DEBUG [{label}]:")
|
| 422 |
if isinstance(data, (list, dict)):
|
| 423 |
print(f" Type: {type(data).__name__}")
|
| 424 |
print(f" Content: {str(data)[:300]}...")
|
|
|
|
| 430 |
question: str,
|
| 431 |
search_results: List[Dict]
|
| 432 |
) -> Dict:
|
| 433 |
+
"""
|
| 434 |
+
Analyze search results and generate intelligent answer
|
| 435 |
+
|
| 436 |
+
Returns:
|
| 437 |
+
{
|
| 438 |
+
'question': user question,
|
| 439 |
+
'answer': detailed answer,
|
| 440 |
+
'sources_used': number of sources,
|
| 441 |
+
'confidence': low/medium/high,
|
| 442 |
+
'search_results': original search results
|
| 443 |
+
}
|
| 444 |
+
"""
|
| 445 |
|
| 446 |
+
print(f"\n{'='*70}")
|
| 447 |
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
| 448 |
+
print(f"{'='*70}")
|
| 449 |
|
| 450 |
+
print(f"\n❓ Question: {question}")
|
| 451 |
+
print(f"📊 Search Results Found: {len(search_results)}")
|
| 452 |
|
| 453 |
+
# Check if we have search results
|
| 454 |
if not search_results:
|
| 455 |
+
print(f"⚠️ No search results found!")
|
| 456 |
+
answer = f"""I could not find relevant information in the document to answer your question: "{question}"
|
| 457 |
+
|
| 458 |
+
Try:
|
| 459 |
+
- Using different keywords
|
| 460 |
+
- Breaking the question into smaller parts
|
| 461 |
+
- Asking about other topics in the document"""
|
| 462 |
|
| 463 |
result = {
|
| 464 |
'question': question,
|
|
|
|
| 470 |
self.answer_log.append(result)
|
| 471 |
return result
|
| 472 |
|
| 473 |
+
# Build context from search results
|
| 474 |
context_parts = []
|
| 475 |
for idx, result in enumerate(search_results, 1):
|
| 476 |
content = result.get('content', '')
|
|
|
|
| 487 |
|
| 488 |
self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
|
| 489 |
|
| 490 |
+
# Build prompt to analyze results and answer question
|
| 491 |
analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
|
| 492 |
|
| 493 |
USER QUESTION:
|
|
|
|
| 503 |
4. If the content doesn't fully answer the question, explain what information is available
|
| 504 |
5. Be specific and cite the content when relevant
|
| 505 |
6. Structure your answer clearly with key points
|
| 506 |
+
|
| 507 |
ANSWER:"""
|
| 508 |
|
| 509 |
+
print(f"\n🔍 Analyzing search results...")
|
| 510 |
print(f" Context size: {len(full_context)} characters")
|
| 511 |
print(f" Sources: {len(search_results)}")
|
| 512 |
|
| 513 |
try:
|
| 514 |
+
# Call LLM to analyze and answer
|
| 515 |
message = HumanMessage(content=analysis_prompt)
|
| 516 |
response = self.llm.invoke([message])
|
| 517 |
answer = response.content.strip()
|
| 518 |
|
| 519 |
+
# Determine confidence level
|
| 520 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 521 |
|
| 522 |
+
print(f"✅ Answer generated successfully")
|
| 523 |
print(f" Confidence: {confidence}")
|
| 524 |
print(f" Answer length: {len(answer)} characters")
|
| 525 |
|
|
|
|
| 535 |
return result
|
| 536 |
|
| 537 |
except Exception as e:
|
| 538 |
+
print(f"❌ Error generating answer: {e}")
|
| 539 |
answer = f"I encountered an error while analyzing the search results. Please try again."
|
| 540 |
|
| 541 |
result = {
|
|
|
|
| 551 |
return result
|
| 552 |
|
| 553 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
| 554 |
+
"""Estimate confidence level of answer"""
|
| 555 |
answer_length = len(answer)
|
| 556 |
|
| 557 |
+
# High confidence: multiple sources, substantial answer
|
| 558 |
if sources_count >= 3 and answer_length > 500:
|
| 559 |
return "high"
|
| 560 |
|
| 561 |
+
# Medium confidence: some sources, decent answer
|
| 562 |
elif sources_count >= 2 and answer_length > 200:
|
| 563 |
return "medium"
|
| 564 |
|
| 565 |
+
# Low confidence: few sources or short answer
|
| 566 |
else:
|
| 567 |
return "low"
|
| 568 |
|
|
|
|
| 571 |
question: str,
|
| 572 |
search_results: List[Dict]
|
| 573 |
) -> Dict:
|
| 574 |
+
"""
|
| 575 |
+
Get answer AND properly formatted sources
|
| 576 |
+
Returns both answer and formatted source citations
|
| 577 |
+
"""
|
| 578 |
|
| 579 |
result = self.analyze_and_answer(question, search_results)
|
| 580 |
|
| 581 |
+
# Format sources for display
|
| 582 |
formatted_sources = []
|
| 583 |
for idx, source in enumerate(result['search_results'], 1):
|
| 584 |
formatted_sources.append({
|
|
|
|
| 592 |
return result
|
| 593 |
|
| 594 |
def get_answer_log(self) -> List[Dict]:
    """Return the accumulated answer-generation log entries.

    Note: this returns the live internal list (not a copy), so callers
    should treat it as read-only.
    """
    return self.answer_log
|
| 597 |
|
| 598 |
def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
    """Pretty-print an answer dict together with its formatted sources.

    Expects ``result`` to carry 'question', 'answer', 'confidence' and
    optionally 'formatted_sources'; each source's content is truncated to
    ``max_source_length`` characters for display.
    """
    heavy = '=' * 70
    light = '-' * 70

    print(f"\n{heavy}")
    print(f"ANSWER TO: {result['question']}")
    print(f"{heavy}")

    print(f"\n📝 ANSWER (Confidence: {result['confidence'].upper()}):")
    print(f"{light}")
    print(result['answer'])
    print(f"{light}")

    sources = result.get('formatted_sources')
    if sources:
        print(f"\n📚 SOURCES USED ({len(sources)} total):")
        for src in sources:
            print(f"\n[Source {src['index']} - {src['type'].upper()} ({src['relevance']:.0%} relevant)]")
            print(f"{src['content'][:max_source_length]}...")

    print(f"\n{heavy}")
|
src/vector_store.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
from typing import List, Dict
|
|
@@ -8,12 +12,14 @@ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
|
|
| 8 |
|
| 9 |
|
| 10 |
class CLIPEmbedder:
|
|
|
|
| 11 |
def __init__(self, model_name: str = EMBEDDING_MODEL):
|
| 12 |
-
print(f" Loading embedding model: {model_name}")
|
| 13 |
self.model = SentenceTransformer(model_name)
|
| 14 |
-
print(f" Model loaded successfully")
|
| 15 |
|
| 16 |
def embed(self, text: str) -> List[float]:
|
|
|
|
| 17 |
try:
|
| 18 |
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 19 |
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
|
@@ -22,6 +28,7 @@ class CLIPEmbedder:
|
|
| 22 |
return [0.0] * EMBEDDING_DIM
|
| 23 |
|
| 24 |
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
|
|
|
| 25 |
try:
|
| 26 |
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 27 |
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
|
@@ -31,30 +38,34 @@ class CLIPEmbedder:
|
|
| 31 |
|
| 32 |
|
| 33 |
class VectorStore:
|
|
|
|
| 34 |
def __init__(self):
|
| 35 |
self.persist_directory = CHROMA_DB_PATH
|
| 36 |
self.embedder = CLIPEmbedder()
|
| 37 |
|
| 38 |
-
print(f" Initializing ChromaDB at: {self.persist_directory}")
|
| 39 |
|
|
|
|
| 40 |
try:
|
| 41 |
self.client = chromadb.PersistentClient(
|
| 42 |
path=self.persist_directory
|
| 43 |
)
|
| 44 |
-
print(f" ChromaDB initialized")
|
| 45 |
except Exception as e:
|
| 46 |
-
print(f" Error initializing ChromaDB: {e}")
|
|
|
|
| 47 |
self.client = chromadb.PersistentClient(
|
| 48 |
path=self.persist_directory
|
| 49 |
)
|
| 50 |
|
|
|
|
| 51 |
try:
|
| 52 |
self.collection = self.client.get_or_create_collection(
|
| 53 |
name="multimodal_rag",
|
| 54 |
metadata={"hnsw:space": "cosine"}
|
| 55 |
)
|
| 56 |
count = self.collection.count()
|
| 57 |
-
print(f" Collection loaded: {count} items in store")
|
| 58 |
except Exception as e:
|
| 59 |
print(f"Error with collection: {e}")
|
| 60 |
self.collection = self.client.get_or_create_collection(
|
|
@@ -62,12 +73,14 @@ class VectorStore:
|
|
| 62 |
)
|
| 63 |
|
| 64 |
def add_documents(self, documents: List[Dict], doc_id: str):
|
|
|
|
| 65 |
texts = []
|
| 66 |
metadatas = []
|
| 67 |
ids = []
|
| 68 |
|
| 69 |
-
print(f" Adding documents for: {doc_id}")
|
| 70 |
|
|
|
|
| 71 |
if 'text' in documents and documents['text']:
|
| 72 |
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 73 |
for idx, chunk in enumerate(chunks):
|
|
@@ -78,8 +91,9 @@ class VectorStore:
|
|
| 78 |
'chunk_idx': str(idx)
|
| 79 |
})
|
| 80 |
ids.append(f"{doc_id}_text_{idx}")
|
| 81 |
-
print(f"
|
| 82 |
|
|
|
|
| 83 |
if 'images' in documents:
|
| 84 |
image_count = 0
|
| 85 |
for idx, image_data in enumerate(documents['images']):
|
|
@@ -94,8 +108,9 @@ class VectorStore:
|
|
| 94 |
ids.append(f"{doc_id}_image_{idx}")
|
| 95 |
image_count += 1
|
| 96 |
if image_count > 0:
|
| 97 |
-
print(f"
|
| 98 |
|
|
|
|
| 99 |
if 'tables' in documents:
|
| 100 |
table_count = 0
|
| 101 |
for idx, table_data in enumerate(documents['tables']):
|
|
@@ -109,12 +124,14 @@ class VectorStore:
|
|
| 109 |
ids.append(f"{doc_id}_table_{idx}")
|
| 110 |
table_count += 1
|
| 111 |
if table_count > 0:
|
| 112 |
-
print(f"
|
| 113 |
|
| 114 |
if texts:
|
| 115 |
-
|
|
|
|
| 116 |
embeddings = self.embedder.embed_batch(texts)
|
| 117 |
|
|
|
|
| 118 |
try:
|
| 119 |
self.collection.add(
|
| 120 |
ids=ids,
|
|
@@ -122,10 +139,11 @@ class VectorStore:
|
|
| 122 |
embeddings=embeddings,
|
| 123 |
metadatas=metadatas
|
| 124 |
)
|
| 125 |
-
print(f" Successfully added {len(texts)} items to vector store")
|
| 126 |
-
|
|
|
|
| 127 |
except Exception as e:
|
| 128 |
-
print(f" Error adding to collection: {e}")
|
| 129 |
|
| 130 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 131 |
"""Search vector store for similar documents"""
|
|
@@ -137,6 +155,7 @@ class VectorStore:
|
|
| 137 |
n_results=n_results
|
| 138 |
)
|
| 139 |
|
|
|
|
| 140 |
formatted_results = []
|
| 141 |
if results['documents']:
|
| 142 |
for i, doc in enumerate(results['documents'][0]):
|
|
@@ -156,6 +175,7 @@ class VectorStore:
|
|
| 156 |
return []
|
| 157 |
|
| 158 |
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
|
|
|
| 159 |
chunks = []
|
| 160 |
start = 0
|
| 161 |
while start < len(text):
|
|
@@ -165,6 +185,7 @@ class VectorStore:
|
|
| 165 |
return chunks
|
| 166 |
|
| 167 |
def get_collection_info(self) -> Dict:
|
|
|
|
| 168 |
try:
|
| 169 |
count = self.collection.count()
|
| 170 |
return {
|
|
@@ -178,25 +199,35 @@ class VectorStore:
|
|
| 178 |
return {'status': 'error', 'message': str(e)}
|
| 179 |
|
| 180 |
def delete_by_doc_id(self, doc_id: str):
|
|
|
|
| 181 |
try:
|
|
|
|
| 182 |
results = self.collection.get(where={'doc_id': doc_id})
|
| 183 |
if results['ids']:
|
| 184 |
self.collection.delete(ids=results['ids'])
|
| 185 |
-
print(f" Deleted {len(results['ids'])} documents for {doc_id}")
|
|
|
|
|
|
|
| 186 |
except Exception as e:
|
| 187 |
print(f"Error deleting documents: {e}")
|
| 188 |
|
| 189 |
def persist(self):
|
| 190 |
-
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
def clear_all(self):
|
|
|
|
| 194 |
try:
|
|
|
|
| 195 |
self.client.delete_collection(name="multimodal_rag")
|
| 196 |
self.collection = self.client.get_or_create_collection(
|
| 197 |
name="multimodal_rag",
|
| 198 |
metadata={"hnsw:space": "cosine"}
|
| 199 |
)
|
| 200 |
-
print(" Collection cleared and reset")
|
| 201 |
except Exception as e:
|
| 202 |
print(f"Error clearing collection: {e}")
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector Store and Embeddings Module using ChromaDB with sentence-transformers
|
| 3 |
+
UPDATED for ChromaDB v0.4.22+ (auto-persist, no manual persist needed)
|
| 4 |
+
"""
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
from typing import List, Dict
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class CLIPEmbedder:
|
| 15 |
+
"""Custom embedder using sentence-transformers for multimodal content"""
|
| 16 |
def __init__(self, model_name: str = EMBEDDING_MODEL):
    """Load the sentence-transformers model used for all embeddings.

    Defaults to EMBEDDING_MODEL from config.  NOTE(review): loading may
    download model weights on first run and can take several seconds —
    confirm this is acceptable at construction time.
    """
    print(f"🔄 Loading embedding model: {model_name}")
    self.model = SentenceTransformer(model_name)
    print(f"✅ Model loaded successfully")
|
| 20 |
|
| 21 |
def embed(self, text: str) -> List[float]:
|
| 22 |
+
"""Generate embedding for text"""
|
| 23 |
try:
|
| 24 |
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 25 |
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
|
|
|
| 28 |
return [0.0] * EMBEDDING_DIM
|
| 29 |
|
| 30 |
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
| 31 |
+
"""Generate embeddings for batch of texts"""
|
| 32 |
try:
|
| 33 |
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 34 |
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
class VectorStore:
|
| 41 |
+
"""Vector store manager using ChromaDB (v0.4.22+ with auto-persist)"""
|
| 42 |
def __init__(self):
|
| 43 |
self.persist_directory = CHROMA_DB_PATH
|
| 44 |
self.embedder = CLIPEmbedder()
|
| 45 |
|
| 46 |
+
print(f"\n🔄 Initializing ChromaDB at: {self.persist_directory}")
|
| 47 |
|
| 48 |
+
# NEW ChromaDB v0.4.22+ - PersistentClient auto-persists
|
| 49 |
try:
|
| 50 |
self.client = chromadb.PersistentClient(
|
| 51 |
path=self.persist_directory
|
| 52 |
)
|
| 53 |
+
print(f"✅ ChromaDB PersistentClient initialized")
|
| 54 |
except Exception as e:
|
| 55 |
+
print(f"❌ Error initializing ChromaDB: {e}")
|
| 56 |
+
print(f"Trying fallback initialization...")
|
| 57 |
self.client = chromadb.PersistentClient(
|
| 58 |
path=self.persist_directory
|
| 59 |
)
|
| 60 |
|
| 61 |
+
# Get or create collection
|
| 62 |
try:
|
| 63 |
self.collection = self.client.get_or_create_collection(
|
| 64 |
name="multimodal_rag",
|
| 65 |
metadata={"hnsw:space": "cosine"}
|
| 66 |
)
|
| 67 |
count = self.collection.count()
|
| 68 |
+
print(f"✅ Collection loaded: {count} items in store")
|
| 69 |
except Exception as e:
|
| 70 |
print(f"Error with collection: {e}")
|
| 71 |
self.collection = self.client.get_or_create_collection(
|
|
|
|
| 73 |
)
|
| 74 |
|
| 75 |
def add_documents(self, documents: List[Dict], doc_id: str):
|
| 76 |
+
"""Add documents to vector store"""
|
| 77 |
texts = []
|
| 78 |
metadatas = []
|
| 79 |
ids = []
|
| 80 |
|
| 81 |
+
print(f"\n📚 Adding documents for: {doc_id}")
|
| 82 |
|
| 83 |
+
# Add text chunks
|
| 84 |
if 'text' in documents and documents['text']:
|
| 85 |
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 86 |
for idx, chunk in enumerate(chunks):
|
|
|
|
| 91 |
'chunk_idx': str(idx)
|
| 92 |
})
|
| 93 |
ids.append(f"{doc_id}_text_{idx}")
|
| 94 |
+
print(f" ✅ Text: {len(chunks)} chunks")
|
| 95 |
|
| 96 |
+
# Add image descriptions and OCR text
|
| 97 |
if 'images' in documents:
|
| 98 |
image_count = 0
|
| 99 |
for idx, image_data in enumerate(documents['images']):
|
|
|
|
| 108 |
ids.append(f"{doc_id}_image_{idx}")
|
| 109 |
image_count += 1
|
| 110 |
if image_count > 0:
|
| 111 |
+
print(f" ✅ Images: {image_count} with OCR text")
|
| 112 |
|
| 113 |
+
# Add table content
|
| 114 |
if 'tables' in documents:
|
| 115 |
table_count = 0
|
| 116 |
for idx, table_data in enumerate(documents['tables']):
|
|
|
|
| 124 |
ids.append(f"{doc_id}_table_{idx}")
|
| 125 |
table_count += 1
|
| 126 |
if table_count > 0:
|
| 127 |
+
print(f" ✅ Tables: {table_count}")
|
| 128 |
|
| 129 |
if texts:
|
| 130 |
+
# Generate embeddings
|
| 131 |
+
print(f" 🔄 Generating {len(texts)} embeddings...")
|
| 132 |
embeddings = self.embedder.embed_batch(texts)
|
| 133 |
|
| 134 |
+
# Add to collection
|
| 135 |
try:
|
| 136 |
self.collection.add(
|
| 137 |
ids=ids,
|
|
|
|
| 139 |
embeddings=embeddings,
|
| 140 |
metadatas=metadatas
|
| 141 |
)
|
| 142 |
+
print(f"✅ Successfully added {len(texts)} items to vector store")
|
| 143 |
+
# Auto-persist happens here
|
| 144 |
+
print(f"✅ Data persisted automatically to: {self.persist_directory}")
|
| 145 |
except Exception as e:
|
| 146 |
+
print(f"❌ Error adding to collection: {e}")
|
| 147 |
|
| 148 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 149 |
"""Search vector store for similar documents"""
|
|
|
|
| 155 |
n_results=n_results
|
| 156 |
)
|
| 157 |
|
| 158 |
+
# Format results
|
| 159 |
formatted_results = []
|
| 160 |
if results['documents']:
|
| 161 |
for i, doc in enumerate(results['documents'][0]):
|
|
|
|
| 175 |
return []
|
| 176 |
|
| 177 |
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
| 178 |
+
"""Split text into chunks with overlap"""
|
| 179 |
chunks = []
|
| 180 |
start = 0
|
| 181 |
while start < len(text):
|
|
|
|
| 185 |
return chunks
|
| 186 |
|
| 187 |
def get_collection_info(self) -> Dict:
|
| 188 |
+
"""Get information about the collection"""
|
| 189 |
try:
|
| 190 |
count = self.collection.count()
|
| 191 |
return {
|
|
|
|
| 199 |
return {'status': 'error', 'message': str(e)}
|
| 200 |
|
| 201 |
def delete_by_doc_id(self, doc_id: str):
    """Remove every stored item whose metadata 'doc_id' matches ``doc_id``.

    Failures are reported to stdout rather than raised (best-effort
    cleanup, matching the rest of this class).
    """
    try:
        # Fetch the IDs of all entries tagged with this document.
        matching = self.collection.get(where={'doc_id': doc_id})
        ids = matching['ids']
        if ids:
            self.collection.delete(ids=ids)
            print(f"✅ Deleted {len(ids)} documents for {doc_id}")
            # PersistentClient writes the change to disk on its own.
            print("✅ Changes persisted automatically")
    except Exception as exc:
        print(f"Error deleting documents: {exc}")
|
| 213 |
|
| 214 |
def persist(self):
    """Backward-compatible no-op.

    ChromaDB v0.4.22+ persists automatically via PersistentClient, so
    there is nothing to flush here; this stub only keeps older call
    sites that invoke ``persist()`` working.
    """
    print("✅ Vector store is using auto-persist (no manual persist needed)")
|
| 221 |
|
| 222 |
def clear_all(self):
    """Drop every stored item by deleting and recreating the collection.

    Recreating from scratch is simpler and faster than deleting entries
    one by one; errors are reported to stdout, not raised.
    """
    try:
        self.client.delete_collection(name="multimodal_rag")
        self.collection = self.client.get_or_create_collection(
            name="multimodal_rag",
            metadata={"hnsw:space": "cosine"},
        )
        print("✅ Collection cleared and reset")
    except Exception as exc:
        print(f"Error clearing collection: {exc}")
|