final_project2

Sleeping

App Files Files Community

dnj0 committed on Nov 18, 2025

Commit

fc31dcc

verified ·

1 Parent(s): bf6fcc4

Update src/rag_system.py

Browse files

Files changed (1) hide show

src/rag_system.py +126 -4

src/rag_system.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 LLM Integration Module using OpenAI GPT-4o and LangChain
-FIXED for LangChain 0.1+ with IMAGE DEBUGGING
 """
 from typing import List, Dict
 from langchain_openai import ChatOpenAI
@@ -25,6 +25,7 @@ class MultimodalRAG:
         self.conversation_history = []
         self.language = LANGUAGE
         if self.debug:
             print("✅ MultimodalRAG initialized with DEBUG mode ON")
@@ -131,7 +132,7 @@ class MultimodalRAG:
             return f"Error: Could not generate answer. {str(e)}"
     def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
-        """Summarize extracted document content including images and tables"""
         try:
             if images is None:
                 images = []
@@ -141,14 +142,40 @@ class MultimodalRAG:
             self._debug_print("Document Summarization Started",
                 f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
             # Extract OCR text from images
             image_ocr_texts = []
             for idx, img in enumerate(images):
                 ocr_text = img.get('ocr_text', '')
                 if ocr_text:
                     image_ocr_texts.append(f"Image {idx}: {ocr_text}")
                     self._debug_print(f"Image {idx} OCR", ocr_text[:100])
                 else:
                     self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
             # Extract table content
@@ -157,6 +184,11 @@ class MultimodalRAG:
                 table_content = tbl.get('content', '')
                 if table_content:
                     table_texts.append(f"Table {idx}:\n{table_content}")
                     self._debug_print(f"Table {idx} Content", table_content[:100])
                 else:
                     self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
@@ -190,6 +222,8 @@ Please include in your summary:
 5. Key information from tables (if present)
 6. Overall document purpose"""
             self._debug_print("Summary Prompt Length", len(summary_prompt))
             self._debug_print("Summary Prompt Content", summary_prompt[:200])
@@ -197,16 +231,104 @@ Please include in your summary:
             self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
             response = self.llm.invoke([message])
-            self._debug_print("Summary Response Length", len(response.content))
-            return response.content
         except Exception as e:
             self._debug_print("ERROR in summarize_document", str(e))
             print(f"Error summarizing document: {e}")
             return f"Error: Could not summarize document. {str(e)}"
     def debug_search_results(self, search_results: List[Dict]) -> Dict:
         """Detailed analysis of search results for debugging"""
         analysis = {

 """
 LLM Integration Module using OpenAI GPT-4o and LangChain
+FIXED for LangChain 0.1+ with IMAGE DEBUGGING + RESULT LOGGING
 """
 from typing import List, Dict
 from langchain_openai import ChatOpenAI
         self.conversation_history = []
         self.language = LANGUAGE
+        self.summarization_log = []  # Track all summarizations
         if self.debug:
             print("✅ MultimodalRAG initialized with DEBUG mode ON")
             return f"Error: Could not generate answer. {str(e)}"
     def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
+        """Summarize extracted document content including images and tables with logging"""
         try:
             if images is None:
                 images = []
             self._debug_print("Document Summarization Started",
                 f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
+            # Log entry
+            log_entry = {
+                'document_text_length': len(document_content),
+                'total_images': len(images),
+                'total_tables': len(tables),
+                'images_with_ocr': 0,
+                'images_empty_ocr': 0,
+                'ocr_texts': [],
+                'table_texts': [],
+                'summary_prompt_length': 0,
+                'summary_result': '',
+                'summary_result_length': 0
+            }
             # Extract OCR text from images
             image_ocr_texts = []
             for idx, img in enumerate(images):
                 ocr_text = img.get('ocr_text', '')
                 if ocr_text:
                     image_ocr_texts.append(f"Image {idx}: {ocr_text}")
+                    log_entry['images_with_ocr'] += 1
+                    log_entry['ocr_texts'].append({
+                        'image_index': idx,
+                        'ocr_length': len(ocr_text),
+                        'ocr_content': ocr_text[:200]  # First 200 chars
+                    })
                     self._debug_print(f"Image {idx} OCR", ocr_text[:100])
                 else:
+                    log_entry['images_empty_ocr'] += 1
+                    log_entry['ocr_texts'].append({
+                        'image_index': idx,
+                        'ocr_length': 0,
+                        'ocr_content': 'EMPTY'
+                    })
                     self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
             # Extract table content
                 table_content = tbl.get('content', '')
                 if table_content:
                     table_texts.append(f"Table {idx}:\n{table_content}")
+                    log_entry['table_texts'].append({
+                        'table_index': idx,
+                        'table_length': len(table_content),
+                        'table_content': table_content[:200]
+                    })
                     self._debug_print(f"Table {idx} Content", table_content[:100])
                 else:
                     self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
 5. Key information from tables (if present)
 6. Overall document purpose"""
+            log_entry['summary_prompt_length'] = len(summary_prompt)
             self._debug_print("Summary Prompt Length", len(summary_prompt))
             self._debug_print("Summary Prompt Content", summary_prompt[:200])
             self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
             response = self.llm.invoke([message])
+            summary = response.content
+            log_entry['summary_result'] = summary
+            log_entry['summary_result_length'] = len(summary)
+            self._debug_print("Summary Response Length", len(summary))
+            # PRINT DETAILED SUMMARIZATION LOG
+            self._print_summarization_log(log_entry)
+            # Store in log
+            self.summarization_log.append(log_entry)
+            return summary
         except Exception as e:
             self._debug_print("ERROR in summarize_document", str(e))
             print(f"Error summarizing document: {e}")
             return f"Error: Could not summarize document. {str(e)}"
+    def _print_summarization_log(self, log_entry: Dict):  # prints one log entry to stdout as a sectioned report; no return value
+        """Print detailed summarization results log"""
+        print("\n" + "="*70)
+        print("📊 IMAGE SUMMARIZATION LOG")
+        print("="*70)
+        # Document composition: text size plus image and table counts read from the entry
+        print("\n📄 DOCUMENT COMPOSITION:")
+        print(f"  Text: {log_entry['document_text_length']:,} characters")
+        print(f"  Images: {log_entry['total_images']} total")
+        print(f"    ✅ With OCR text: {log_entry['images_with_ocr']}")
+        print(f"    ⚠️  Empty OCR: {log_entry['images_empty_ocr']}")
+        print(f"  Tables: {log_entry['total_tables']} total")
+        # Image OCR details: one line per recorded image; section is skipped when the list is empty
+        if log_entry['ocr_texts']:
+            print("\n🖼️  IMAGE OCR TEXT DETAILS:")
+            for ocr in log_entry['ocr_texts']:
+                idx = ocr['image_index']
+                length = ocr['ocr_length']
+                content = ocr['ocr_content']
+                if length == 0:  # a recorded length of 0 marks an image without OCR text
+                    print(f"  Image {idx}: ⚠️  EMPTY (0 chars)")
+                else:
+                    print(f"  Image {idx}: ✅ {length} characters")
+                    print(f"    Content: {content}...")  # NOTE(review): "..." is printed regardless of the content's length
+        # Table details: one entry per recorded table; section is skipped when the list is empty
+        if log_entry['table_texts']:
+            print("\n📋 TABLE DETAILS:")
+            for tbl in log_entry['table_texts']:
+                idx = tbl['table_index']
+                length = tbl['table_length']
+                content = tbl['table_content']
+                print(f"  Table {idx}: {length} characters")
+                print(f"    Content: {content}...")  # NOTE(review): "..." is printed regardless of the content's length
+        # Prompt details: prompt size and whether image/table entries were recorded
+        print("\n📝 SUMMARIZATION PROMPT:")
+        print(f"  Total length: {log_entry['summary_prompt_length']:,} characters")
+        print(f"  Includes images: {'✅ Yes' if log_entry['ocr_texts'] else '❌ No'}")  # NOTE(review): tests list truthiness; summarize_document also appends 'EMPTY' placeholder entries, so this shows Yes even when images_with_ocr is 0
+        print(f"  Includes tables: {'✅ Yes' if log_entry['table_texts'] else '❌ No'}")  # true when at least one table entry was recorded
+        # Summary result: length followed by a capped preview of the text
+        print("\n✨ SUMMARY RESULT:")
+        print(f"  Length: {log_entry['summary_result_length']:,} characters")
+        print(f"  Content:")
+        print("  " + "-"*66)
+        # Print the summary line by line (split on '\n'; no wrapping of long lines is done)
+        summary_lines = log_entry['summary_result'].split('\n')
+        for line in summary_lines[:15]:  # preview is capped at the first 15 lines
+            print(f"  {line}")
+        if len(summary_lines) > 15:  # report how many lines were left out of the preview
+            print(f"  ... ({len(summary_lines) - 15} more lines)")
+        print("  " + "-"*66)
+        print("\n" + "="*70)
+    def get_summarization_log(self) -> List[Dict]:  # accessor for the entries accumulated in self.summarization_log
+        """Get all summarization logs"""
+        return self.summarization_log  # returns the list object itself, not a copy; caller mutations affect the stored log
+    def print_summarization_history(self):  # prints every stored log entry to stdout, in stored order; no return value
+        """Print all summarization logs"""
+        print("\n📚 SUMMARIZATION HISTORY:")
+        print(f"Total summarizations: {len(self.summarization_log)}")
+        for idx, log in enumerate(self.summarization_log, 1):  # entries are numbered starting at 1
+            print(f"\n{'='*70}")
+            print(f"Summarization #{idx}")
+            print(f"{'='*70}")
+            self._print_summarization_log(log)  # reuses the single-entry report printer for each entry
     def debug_search_results(self, search_results: List[Dict]) -> Dict:
         """Detailed analysis of search results for debugging"""
         analysis = {