Update src/rag_system.py

src/rag_system.py  CHANGED  (+264 -320)
Old version (removed):

@@ -1,16 +1,24 @@
 """
-…
-…
 """
 from typing import List, Dict
 from langchain_openai import ChatOpenAI
-from langchain_core.messages import HumanMessage, SystemMessage
-import …
-from config import …


 class MultimodalRAG:
-    """…"""

     def __init__(self, api_key: str = None, debug: bool = True):
         api_key = api_key or OPENAI_API_KEY
@@ -25,10 +33,10 @@ class MultimodalRAG:

         self.conversation_history = []
         self.language = LANGUAGE
-        self.summarization_log = []

         if self.debug:
-            print("✅ …")

     def _debug_print(self, label: str, data: any):
         """Print debug information"""
@@ -36,354 +44,290 @@ class MultimodalRAG:
         print(f"\n🔍 DEBUG [{label}]:")
         if isinstance(data, (list, dict)):
             print(f"  Type: {type(data).__name__}")
-            print(f"  Content: {str(data)[:…")
         else:
             print(f"  {data}")

-    def _build_context_prompt(self, search_results…):
-        """…"""
-        …
-            content_type = result.get('type', 'unknown')
-            content = result.get('content', '')
-            distance = result.get('distance', 0)
-            …
-            elif content_type == 'table':
-                table_count += 1
-            else:
-                text_count += 1
-
-            self.…(
-                f"…
-                content[:100]…
-            )
-        …
-        self._debug_print("…",
-                          f"Text: {text_count}, Images: {image_count}, Tables: {table_count}")
-        self._debug_print("Total Context Length", len(context))
-        …
-
-    def answer_question(self, …):
-        """…"""
-        …
-            # Build context from search results
-            context = self._build_context_prompt(search_results)
-
-            # Create system message
-            system_message = SystemMessage(
-                content=f"""You are a helpful assistant that answers questions about documents.
-You work with documents that contain text, tables, and images.
-Language: {self.language}
-…
-            response = self.llm.invoke([system_message] + self.conversation_history)
-
-            # Add response to history
-            self.conversation_history.append(response)
-
-            self._debug_print("Response Length", len(response.content))
-
-            # Keep conversation history manageable (last 10 messages)
-            if len(self.conversation_history) > 10:
-                self.conversation_history = self.conversation_history[-10:]
-
-            return response.content
-
-        except Exception as e:
-            self._debug_print("ERROR in answer_question", str(e))
-            print(f"Error generating answer: {e}")
-            return f"Error: Could not generate answer. {str(e)}"
-
-    def summarize_document(self, …):
-        """…"""
-        …
-                f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
-
-            # Log entry
-            log_entry = {
-                'document_text_length': len(document_content),
-                'total_images': len(images),
-                'total_tables': len(tables),
-                'images_with_ocr': 0,
-                'images_empty_ocr': 0,
-                'ocr_texts': [],
-                'table_texts': [],
-                'summary_prompt_length': 0,
-                'summary_result': '',
-                'summary_result_length': 0
-            }
-
-            # Extract OCR text from images
-            image_ocr_texts = []
-            for idx, img in enumerate(images):
-                ocr_text = img.get('ocr_text', '')
-                if ocr_text:
-                    image_ocr_texts.append(f"Image {idx}: {ocr_text}")
-                    log_entry['images_with_ocr'] += 1
-                    log_entry['ocr_texts'].append({
-                        'image_index': idx,
-                        'ocr_length': len(ocr_text),
-                        'ocr_content': ocr_text[:200]  # First 200 chars
-                    })
-                    self._debug_print(f"Image {idx} OCR", ocr_text[:100])
-                else:
-                    log_entry['images_empty_ocr'] += 1
-                    log_entry['ocr_texts'].append({
-                        'image_index': idx,
-                        'ocr_length': 0,
-                        'ocr_content': 'EMPTY'
-                    })
-                    self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
-
-            …
-            for idx, tbl in enumerate(tables):
-                table_content = tbl.get('content', '')
-                if table_content:
-                    table_texts.append(f"Table {idx}:\n{table_content}")
-                    log_entry['table_texts'].append({
-                        'table_index': idx,
-                        'table_length': len(table_content),
-                        'table_content': table_content[:200]
-                    })
-                    self._debug_print(f"Table {idx} Content", table_content[:100])
-                else:
-                    self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
-
-            …
-            summary_prompt = f"""…
-            {…}
-            …
-4. Key information from images (if present)
-5. Key information from tables (if present)
-6. Overall document purpose"""
-
-            log_entry['summary_prompt_length'] = len(summary_prompt)
-
-            self._debug_print("Summary Prompt Length", len(summary_prompt))
-            self._debug_print("Summary Prompt Content", summary_prompt[:200])
-
-            message = HumanMessage(content=summary_prompt)
-            self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
-
-            response = self.llm.invoke([message])
-            summary = response.content
-
-            log_entry['summary_result'] = summary
-            log_entry['summary_result_length'] = len(summary)
-
-            self._debug_print("Summary Response Length", len(summary))
-
-            # PRINT DETAILED SUMMARIZATION LOG
-            self._print_summarization_log(log_entry)
-
-            # Store in log
-            self.summarization_log.append(log_entry)
-
-            return summary
-
-        except Exception as e:
-            self._debug_print("ERROR in summarize_document", str(e))
-            print(f"Error summarizing document: {e}")
-            return f"Error: Could not summarize document. {str(e)}"
-
-    def _print_summarization_log(self, log_entry):
-        """…"""
-        …
-                print(f"  Table {idx}: {length} characters")
-                print(f"  Content: {content}...")
-
-        # …
-        print("\n…")
-        print(f"…")
-        print(f"  Includes images: {'✅ Yes' if log_entry['ocr_texts'] else '❌ No'}")
-        print(f"  Includes tables: {'✅ Yes' if log_entry['table_texts'] else '❌ No'}")
-
-        …
-        print(f"  Length: {log_entry['summary_result_length']:,} characters")
-        print(f"  Content:")
-        print("  " + "-"*66)
-
-        # …
-        …
-        print(f"…")
-
-    def …(self):
-        """Get all summarization logs"""
-        return self.summarization_log
-
-    def print_summarization_history(self):
-        """Print all summarization logs"""
-        print("\n📋 SUMMARIZATION HISTORY:")
-        print(f"Total summarizations: {len(self.summarization_log)}")
-        …
-
-    def …(self, search_results…):
-        """…"""
-        …
-        analysis = {
-            'total_results': len(search_results),
-            'by_type': {'text': 0, 'image': 0, 'table': 0},
-            'average_distance': 0,
-            'images_with_content': 0,
-            'images_empty': 0,
-            'details': []
-        }
-
-        …
-            else:
-                analysis['images_empty'] += 1
-
-            analysis['details'].append({
-                'index': idx,
-                'type': content_type,
-                'distance': distance,
-                'content_length': len(content),
-                'has_content': bool(content.strip())
-            })
-        …

     def clear_history(self):
         """Clear conversation history"""
         self.conversation_history = []
         if self.debug:
-            print("✅ Conversation history cleared")
-
-    def get_history(self) -> List:
-        """Get conversation history"""
-        return self.conversation_history
-
-    def toggle_debug(self, enabled: bool):
-        """Toggle debug mode on/off"""
-        self.debug = enabled
-        print(f"🔍 Debug mode: {'ON' if enabled else 'OFF'}")
New version:

"""
Enhanced RAG System - Individual Summarization + Vector Store Persistence
Summarizes each image, text chunk, and table separately, then stores results
"""
from typing import List, Dict
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
import hashlib
from config import (
    OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
    LANGUAGE, CACHE_RESPONSES, BATCH_SEARCH_RESULTS
)
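For reference, the config import above only needs those seven names. A minimal config.py satisfying it might look like the sketch below; every concrete value is an assumption, not part of the diff:

# config.py - hypothetical sketch; only the names imported by
# rag_system.py are grounded in the diff, every value is an assumption.
import os

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = "gpt-4o-mini"     # assumed model name
TEMPERATURE = 0.0                # assumed
MAX_TOKENS = 1024                # assumed
LANGUAGE = "English"             # interpolated into the prompts below
CACHE_RESPONSES = True           # imported; any use is in lines not shown
BATCH_SEARCH_RESULTS = 5         # imported; any use is in lines not shown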


class MultimodalRAG:
    """
    RAG system that:
    1. Summarizes each component individually
    2. Stores summaries in vector store
    3. Enables fine-grained semantic search
    """

    def __init__(self, api_key: str = None, debug: bool = True):
        api_key = api_key or OPENAI_API_KEY
        # … (lines 25-32 unchanged, collapsed in the diff)

        self.conversation_history = []
        self.language = LANGUAGE
        self.summaries_log = []

        if self.debug:
            print("✅ EnhancedMultimodalRAG initialized")

    def _debug_print(self, label: str, data: any):
        """Print debug information"""
        # … (line 43 unchanged, collapsed in the diff)
        print(f"\n🔍 DEBUG [{label}]:")
        if isinstance(data, (list, dict)):
            print(f"  Type: {type(data).__name__}")
            print(f"  Content: {str(data)[:300]}...")
        else:
            print(f"  {data}")

    def summarize_image(self, image_ocr_text: str, image_idx: int) -> str:
        """
        Summarize a single image's OCR text
        Returns concise summary focused on image content
        """
        if not image_ocr_text or len(image_ocr_text.strip()) < 5:
            return f"[Image {image_idx}: No readable text or empty content]"

        try:
            prompt = f"""Summarize this text extracted from an image in {self.language}.
Keep it concise but informative. Focus on key information, data, and visual elements.

Image OCR Text:
{image_ocr_text}

Summary (2-3 sentences maximum):"""

            message = HumanMessage(content=prompt)
            response = self.llm.invoke([message])
            summary = response.content.strip()

            if self.debug:
                self._debug_print(f"Image {image_idx} Summary", summary)

            return summary
        except Exception as e:
            error_msg = f"[Image {image_idx}: Summarization failed - {str(e)}]"
            print(f"Error summarizing image {image_idx}: {e}")
            return error_msg

    def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
        """
        Chunk text and summarize each chunk individually
        Returns list of {type, chunk_index, original_text, summary, chunk_length}
        """
        chunks = []

        # Split text into chunks
        text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)

        self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")

        for idx, chunk in enumerate(text_chunks):
            if len(chunk.strip()) < 50:  # Skip very small chunks
                continue

            try:
                # Summarize chunk
                prompt = f"""Summarize this text chunk in {self.language}.
Keep it concise. Extract key points, facts, and main ideas.

Text Chunk:
{chunk}

Summary (2-3 sentences maximum):"""

                message = HumanMessage(content=prompt)
                response = self.llm.invoke([message])
                summary = response.content.strip()

                chunks.append({
                    'type': 'text_chunk',
                    'chunk_index': len(chunks),
                    'original_text': chunk[:500],  # Store first 500 chars
                    'summary': summary,
                    'chunk_length': len(chunk)
                })

                if self.debug:
                    self._debug_print(f"Text Chunk {len(chunks)-1} Summary", summary)

            except Exception as e:
                print(f"Error summarizing text chunk: {e}")

        return chunks

    def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
        """
        Summarize each table individually
        Returns list of {type, table_index, original_content, summary, table_length}
        """
        summaries = []

        for idx, table in enumerate(tables):
            table_content = table.get('content', '')

            if not table_content or len(table_content.strip()) < 10:
                continue

            try:
                # Summarize table
                prompt = f"""Analyze and summarize this table/structured data in {self.language}.
Extract key insights, row/column meanings, and important figures.

Table Content:
{table_content}

Summary (2-3 sentences maximum):"""

                message = HumanMessage(content=prompt)
                response = self.llm.invoke([message])
                summary = response.content.strip()

                summaries.append({
                    'type': 'table',
                    'table_index': idx,
                    'original_content': table_content[:500],
                    'summary': summary,
                    'table_length': len(table_content)
                })

                if self.debug:
                    self._debug_print(f"Table {idx} Summary", summary)

            except Exception as e:
                print(f"Error summarizing table {idx}: {e}")

        return summaries

    def summarize_images(self, images: List[Dict]) -> List[Dict]:
        """
        Summarize each image individually
        Returns list of {type, image_index, original_ocr, summary, ocr_length}
        """
        summaries = []

        for idx, image in enumerate(images):
            ocr_text = image.get('ocr_text', '')
            summary = self.summarize_image(ocr_text, idx)

            summaries.append({
                'type': 'image',
                'image_index': idx,
                'original_ocr': ocr_text[:500],
                'summary': summary,
                'ocr_length': len(ocr_text)
            })

        return summaries

    def process_and_store_document(
        self,
        text: str,
        images: List[Dict],
        tables: List[Dict],
        vector_store,
        doc_id: str
    ) -> Dict:
        """
        Main function: Summarize all components and store in vector store
        Returns summary statistics
        """
        print(f"\n{'='*70}")
        print(f"PROCESSING AND STORING: {doc_id}")
        print(f"{'='*70}")

        results = {
            'doc_id': doc_id,
            'image_summaries': [],
            'text_summaries': [],
            'table_summaries': [],
            'total_stored': 0
        }

        # 1. Summarize and store images
        print(f"\n🖼️ PROCESSING IMAGES ({len(images)} total)")
        print(f"{'─'*70}")

        image_summaries = self.summarize_images(images)
        results['image_summaries'] = image_summaries

        # Store the image summaries as one combined document in the vector store
        image_docs = {
            'text': ' | '.join([f"Image {s['image_index']}: {s['summary']}"
                                for s in image_summaries]),
            'images': [],
            'tables': []
        }

        for summary in image_summaries:
            print(f"  ✅ Image {summary['image_index']}: {summary['summary'][:50]}...")

        if image_summaries:
            try:
                vector_store.add_documents(
                    image_docs,
                    f"{doc_id}_images"
                )
                results['total_stored'] += len(image_summaries)
                print(f"✅ Stored {len(image_summaries)} image summaries")
            except Exception as e:
                print(f"❌ Error storing image summaries: {e}")

        # 2. Summarize and store text chunks
        print(f"\n📄 PROCESSING TEXT CHUNKS")
        print(f"{'─'*70}")

        text_summaries = self.summarize_text_chunks(text)
        results['text_summaries'] = text_summaries

        # Store the text chunk summaries as one combined document in the vector store
        text_docs = {
            'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
                                for s in text_summaries]),
            'images': [],
            'tables': []
        }

        for summary in text_summaries:
            print(f"  ✅ Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")

        if text_summaries:
            try:
                vector_store.add_documents(
                    text_docs,
                    f"{doc_id}_text_chunks"
                )
                results['total_stored'] += len(text_summaries)
                print(f"✅ Stored {len(text_summaries)} text chunk summaries")
            except Exception as e:
                print(f"❌ Error storing text summaries: {e}")

        # 3. Summarize and store tables
        print(f"\n📊 PROCESSING TABLES ({len(tables)} total)")
        print(f"{'─'*70}")

        table_summaries = self.summarize_tables(tables)
        results['table_summaries'] = table_summaries

        # Store the table summaries as one combined document in the vector store
        table_docs = {
            'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
                                for s in table_summaries]),
            'images': [],
            'tables': []
        }

        for summary in table_summaries:
            print(f"  ✅ Table {summary['table_index']}: {summary['summary'][:50]}...")

        if table_summaries:
            try:
                vector_store.add_documents(
                    table_docs,
                    f"{doc_id}_tables"
                )
                results['total_stored'] += len(table_summaries)
                print(f"✅ Stored {len(table_summaries)} table summaries")
            except Exception as e:
                print(f"❌ Error storing table summaries: {e}")

        # 4. Summary statistics
        print(f"\n{'='*70}")
        print(f"📊 STORAGE SUMMARY")
        print(f"{'='*70}")
        print(f"  Images summarized & stored: {len(image_summaries)}")
        print(f"  Text chunks summarized & stored: {len(text_summaries)}")
        print(f"  Tables summarized & stored: {len(table_summaries)}")
        print(f"  Total items stored: {results['total_stored']}")
        print(f"{'='*70}")

        self.summaries_log.append(results)
        return results
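The vector_store parameter is duck-typed: process_and_store_document only calls vector_store.add_documents(docs, collection_id) with the {'text', 'images', 'tables'} dict built above. A minimal in-memory stand-in for testing might look like this sketch; the class name and storage scheme are assumptions, not part of the diff:

# Hypothetical stand-in for the vector_store dependency; only the
# add_documents(docs, doc_id) call shape is grounded in the code above.
class InMemoryVectorStore:
    def __init__(self):
        self.collections = {}  # collection id -> list of stored doc dicts

    def add_documents(self, docs: dict, doc_id: str):
        # docs is the {'text': ..., 'images': [], 'tables': []} dict built
        # by process_and_store_document; doc_id is e.g. "mydoc_images"
        self.collections.setdefault(doc_id, []).append(docs)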

    def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
        """Split text into overlapping chunks"""
        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            chunks.append(text[start:end])
            start = end - overlap
        return chunks
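With the defaults above, each new chunk starts chunk_size - overlap = 1200 characters after the previous one, so a 4000-character text yields windows [0:1500], [1200:2700], [2400:3900], [3600:4000]. A quick standalone check of that arithmetic (a copy of the helper, so no class instance is needed):

# Standalone copy of _chunk_text to illustrate the 1200-character stride.
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 300):
    chunks, start = [], 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap
    return chunks

print([len(c) for c in chunk_text("x" * 4000)])  # [1500, 1500, 1500, 400]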

    def get_summaries_log(self) -> List[Dict]:
        """Get all processing logs"""
        return self.summaries_log

    def clear_history(self):
        """Clear conversation history"""
        self.conversation_history = []
        if self.debug:
            print("✅ Conversation history cleared")
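Putting it together, a minimal driver might look like the sketch below. The input shapes ({'ocr_text': ...} for images, {'content': ...} for tables) are the keys the summarizers above read; the sample data is invented, the store is the InMemoryVectorStore sketch from earlier, and constructing MultimodalRAG assumes OPENAI_API_KEY is set, since the collapsed __init__ lines presumably build the ChatOpenAI client:

# Hypothetical end-to-end driver; MultimodalRAG and the input dict shapes
# come from the file above, the sample data and store are assumptions.
rag = MultimodalRAG(debug=True)
store = InMemoryVectorStore()  # or any object with add_documents(docs, doc_id)

results = rag.process_and_store_document(
    text="Quarterly report body text ...",
    images=[{'ocr_text': 'Revenue chart: Q1 2.1M, Q2 2.4M'}],
    tables=[{'content': 'Region | Sales\nNorth | 1.2M\nSouth | 0.9M'}],
    vector_store=store,
    doc_id="report_q2",
)
print(results['total_stored'])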