final_project2

Sleeping

App Files Files Community

dnj0 commited on Nov 18, 2025

Commit

fa76eb3

1 Parent(s): ecab17a

debug

Browse files

Files changed (2) hide show

src/pdf_parser.py +76 -18
src/rag_system.py +216 -91

src/pdf_parser.py CHANGED Viewed

@@ -1,6 +1,5 @@
 """
-PDF Parser Module for extracting text, images, and tables
 """
 import os
 import json
@@ -15,10 +14,28 @@ from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
 class PDFParser:
-    def __init__(self):
         self.docstore_path = Path(DOCSTORE_PATH)
         self.docstore_path.mkdir(exist_ok=True)
         self.processed_files = self._load_processed_files()
     def _load_processed_files(self) -> Dict[str, str]:
         """Load list of already processed files with their hashes"""
@@ -49,23 +66,48 @@ class PDFParser:
         try:
             with open(pdf_path, 'rb') as file:
                 reader = PyPDF2.PdfReader(file)
-                for page in reader.pages:
-                    text += page.extract_text() + "\n"
         except Exception as e:
-            print(f"Error extracting text: {e}")
         return text
     def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
-        """Extract images from PDF pages"""
         images_data = []
         try:
             images = convert_from_path(pdf_path, dpi=150)
             for idx, image in enumerate(images):
                 image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
                 image.save(image_path)
-                # Extract text from image using OCR
-                ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
                 images_data.append({
                     'page': idx,
@@ -74,19 +116,20 @@ class PDFParser:
                     'description': f"Image from page {idx + 1}"
                 })
         except Exception as e:
-            print(f"Error extracting images: {e}")
         return images_data
     def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
         """Extract table content from PDF"""
         tables_data = []
         try:
-            # For simple table extraction, we'll use text patterns
-            # For advanced table detection, consider using 'tabula-py' or 'pdfplumber'
             text = self._extract_text_from_pdf(pdf_path)
-            # Basic table detection (lines with multiple spaces or separators)
             lines = text.split('\n')
             current_table = []
             for line in lines:
                 if '|' in line or '\t' in line:
@@ -104,28 +147,41 @@ class PDFParser:
                     'content': '\n'.join(current_table),
                     'description': f"Table {len(tables_data) + 1}"
                 })
         except Exception as e:
-            print(f"Error extracting tables: {e}")
         return tables_data
     def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
-        """Parse PDF and extract text, images, and tables"""
         file_hash = self._get_file_hash(pdf_path)
         doc_id = Path(pdf_path).stem
         # Check if file was already processed
         if doc_id in self.processed_files:
             if self.processed_files[doc_id] == file_hash:
-                print(f"File {doc_id} already processed, skipping...")
                 return self._load_extracted_data(doc_id)
-        print(f"Processing PDF: {doc_id}")
         # Extract content
         text = self._extract_text_from_pdf(pdf_path)
         images = self._extract_images_from_pdf(pdf_path, doc_id)
         tables = self._extract_tables_from_pdf(pdf_path, doc_id)
         # Save extracted data
         self._save_extracted_data(doc_id, text, images, tables)
@@ -145,6 +201,8 @@ class PDFParser:
         data_path = self.docstore_path / f"{doc_id}_data.json"
         with open(data_path, 'w', encoding='utf-8') as f:
             json.dump(data, f, ensure_ascii=False, indent=2)
     def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
         """Load previously extracted data from docstore"""
@@ -166,4 +224,4 @@ class PDFParser:
                     all_docs[doc_id] = json.load(f)
             except:
                 pass
-        return all_docs

 """
+PDF Parser Module with DEBUG for image extraction
 """
 import os
 import json
 class PDFParser:
+    def __init__(self, debug: bool = True):
+        """Set up the docstore directory, the processed-file log and debug mode.
+        Args:
+            debug: When true, ``_debug_print`` output is enabled and a startup
+                banner is printed. Defaults to True.
+        """
         self.docstore_path = Path(DOCSTORE_PATH)
+        # exist_ok=True tolerates an already existing directory; missing parent
+        # directories are not created by this call.
         self.docstore_path.mkdir(exist_ok=True)
+        # NOTE(review): self.debug is only assigned further down. If
+        # _load_processed_files() ever calls _debug_print, this line would raise
+        # AttributeError -- confirm, or move the assignment above this call.
         self.processed_files = self._load_processed_files()
+        self.debug = debug
+        if self.debug:
+            print("✅ PDFParser initialized with DEBUG mode ON")
+    def _debug_print(self, label: str, data: object) -> None:
+        """Print a labelled debug message when debug mode is on.
+        Dicts are printed one ``key: value`` pair per line. Lists and tuples
+        print their length followed by at most the first three items, each cut
+        to 100 characters. Any other value is printed as-is. Nothing is printed
+        when ``self.debug`` is false.
+        Args:
+            label: Heading shown after the ``[PDF Parser]`` prefix.
+            data: Value to display; its type selects the layout described above.
+        """
+        if self.debug:
+            print(f"\n🔍 [PDF Parser] {label}")
+            if isinstance(data, dict):
+                for key, val in data.items():
+                    print(f"  {key}: {val}")
+            elif isinstance(data, (list, tuple)):
+                print(f"  Count: {len(data)}")
+                # Only a short preview: first three items, 100 characters each.
+                for i, item in enumerate(data[:3]):
+                    print(f"  [{i}]: {str(item)[:100]}")
+            else:
+                print(f"  {data}")
     def _load_processed_files(self) -> Dict[str, str]:
         """Load list of already processed files with their hashes"""
         try:
             with open(pdf_path, 'rb') as file:
                 reader = PyPDF2.PdfReader(file)
+                page_count = len(reader.pages)
+                self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
+                for page_num, page in enumerate(reader.pages):
+                    page_text = page.extract_text()
+                    text += page_text + "\n"
+                    self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
         except Exception as e:
+            self._debug_print("ERROR extracting text", str(e))
+        self._debug_print("Total Text Extracted", len(text))
         return text
     def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
+        """Extract images from PDF pages with detailed debugging"""
         images_data = []
         try:
+            self._debug_print("Image Extraction Started", f"File: {pdf_path}")
             images = convert_from_path(pdf_path, dpi=150)
+            self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
             for idx, image in enumerate(images):
+                self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
+                # Save image
                 image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
                 image.save(image_path)
+                self._debug_print(f"Image {idx} Saved", str(image_path))
+                # Extract text using OCR
+                self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR...")
+                try:
+                    ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
+                    self._debug_print(f"Image {idx} OCR Result", f"Length: {len(ocr_text)}, Content: {ocr_text[:200] if ocr_text else 'EMPTY'}")
+                    if not ocr_text or len(ocr_text.strip()) < 5:
+                        self._debug_print(f"Image {idx} WARNING", "⚠️ OCR returned empty or very short text!")
+                except Exception as ocr_error:
+                    self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
+                    ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"
                 images_data.append({
                     'page': idx,
                     'description': f"Image from page {idx + 1}"
                 })
         except Exception as e:
+            self._debug_print("ERROR extracting images", str(e))
+        self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
         return images_data
     def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
         """Extract table content from PDF"""
         tables_data = []
         try:
             text = self._extract_text_from_pdf(pdf_path)
             lines = text.split('\n')
+            self._debug_print("Table Detection", f"Scanning {len(lines)} lines")
             current_table = []
             for line in lines:
                 if '|' in line or '\t' in line:
                     'content': '\n'.join(current_table),
                     'description': f"Table {len(tables_data) + 1}"
                 })
+            self._debug_print("Tables Found", len(tables_data))
         except Exception as e:
+            self._debug_print("ERROR extracting tables", str(e))
         return tables_data
     def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
+        """Parse PDF and extract text, images, and tables with debug output"""
         file_hash = self._get_file_hash(pdf_path)
         doc_id = Path(pdf_path).stem
+        self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
         # Check if file was already processed
         if doc_id in self.processed_files:
             if self.processed_files[doc_id] == file_hash:
+                self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
                 return self._load_extracted_data(doc_id)
+        print(f"\n📄 Processing PDF: {doc_id}")
         # Extract content
         text = self._extract_text_from_pdf(pdf_path)
         images = self._extract_images_from_pdf(pdf_path, doc_id)
         tables = self._extract_tables_from_pdf(pdf_path, doc_id)
+        # Summary
+        self._debug_print("Extraction Summary", {
+            'text_length': len(text),
+            'images_count': len(images),
+            'tables_count': len(tables),
+            'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
+        })
         # Save extracted data
         self._save_extracted_data(doc_id, text, images, tables)
         data_path = self.docstore_path / f"{doc_id}_data.json"
         with open(data_path, 'w', encoding='utf-8') as f:
             json.dump(data, f, ensure_ascii=False, indent=2)
+        self._debug_print("Data Saved", str(data_path))
     def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
         """Load previously extracted data from docstore"""
                     all_docs[doc_id] = json.load(f)
             except:
                 pass
+        return all_docs

src/rag_system.py CHANGED Viewed

@@ -1,142 +1,267 @@
 """
-Token-Optimized RAG System with Caching and Prompt Compression
 """
-import json
-import hashlib
 from typing import List, Dict
 from langchain_openai import ChatOpenAI
 from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
-from config import (
-    OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
-    LANGUAGE, CACHE_RESPONSES, BATCH_SEARCH_RESULTS
-)
 class MultimodalRAG:
-    """RAG system optimized for minimal token usage"""
-    def __init__(self, api_key: str = None):
         api_key = api_key or OPENAI_API_KEY
-        # Use gpt-4o-mini for 20% cost reduction
         self.llm = ChatOpenAI(
             model_name=OPENAI_MODEL,
-            openai_api_key=api_key,
             temperature=TEMPERATURE,
             max_tokens=MAX_TOKENS,
         )
-        self.response_cache = {}  # Cache responses
-        self.doc_summaries = {}   # Store doc summaries
         self.language = LANGUAGE
-    def _get_cache_key(self, query: str) -> str:
-        """Generate cache key for query"""
-        return hashlib.md5(query.encode()).hexdigest()
-    def _compress_context(self, search_results: List[Dict]) -> str:
-        """Compress context to minimal tokens"""
-        context = ""
-        for idx, result in enumerate(search_results[:BATCH_SEARCH_RESULTS], 1):
-            content = result.get('content', '')[:200]  # Limit to 200 chars
-            content_type = result.get('type', 'text')
             if content_type == 'image':
-                context += f"[IMG{idx}]{content}\n"
             elif content_type == 'table':
-                context += f"[TBL{idx}]{content}\n"
             else:
-                context += f"[{idx}]{content}\n"
         return context
-    def answer_question(self, question: str, search_results: List[Dict]) -> str:
-        """Generate answer with minimal tokens"""
-        # Check cache first
-        if CACHE_RESPONSES:
-            cache_key = self._get_cache_key(question)
-            if cache_key in self.response_cache:
-                return self.response_cache[cache_key]
         try:
-            # Compress context aggressively
-            context = self._compress_context(search_results)
-            # Ultra-compact prompt
-            prompt = f"""Q:{question}
-C:{context}
-A:"""
-            message = HumanMessage(content=prompt)
-            response = self.llm([message])
-            answer = response.content
-            # Cache response
-            if CACHE_RESPONSES:
-                self.response_cache[cache_key] = answer
-            return answer
         except Exception as e:
-            return f"Error: {str(e)}"
-    def quick_summarize(self, text: str, doc_id: str) -> str:
-        """Summarize document once and cache"""
-        if doc_id in self.doc_summaries:
-            return self.doc_summaries[doc_id]
         try:
-            # Truncate text to first 2000 chars
-            text = text[:2000]
-            prompt = f"""Summarize in 50 words:
-{text}"""
-            message = HumanMessage(content=prompt)
-            response = self.llm([message])
-            summary = response.content
-            # Cache
-            self.doc_summaries[doc_id] = summary
-            return summary
         except Exception as e:
-            return f"Error: {str(e)}"
-    def batch_questions(self, questions: List[str], search_results: List[Dict]) -> List[str]:
-        """Answer multiple questions in one API call"""
-        try:
-            context = self._compress_context(search_results)
-            # Combine questions
-            qa_prompt = "Answer concisely:\n"
-            for i, q in enumerate(questions, 1):
-                qa_prompt += f"Q{i}:{q}\n"
-            qa_prompt += f"Context:{context}\nAnswers:"
-            message = HumanMessage(content=qa_prompt)
-            response = self.llm([message])
-            # Parse responses
-            answers = response.content.split('\n')
-            return answers[:len(questions)]
-        except Exception as e:
-            return [f"Error: {str(e)}"] * len(questions)
-    def clear_cache(self):
-        """Clear response cache"""
-        self.response_cache.clear()
-    def get_cache_stats(self) -> Dict:
-        """Get cache statistics"""
-        return {
-            'cached_responses': len(self.response_cache),
-            'cached_summaries': len(self.doc_summaries),
-            'total_cache_size': len(json.dumps(self.response_cache))
-        }

 """
+LLM Integration Module using OpenAI GPT-4o and LangChain
+FIXED for LangChain 0.1+ with IMAGE DEBUGGING
 """
 from typing import List, Dict
 from langchain_openai import ChatOpenAI
 from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
+import os
+from config import OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS, LANGUAGE
 class MultimodalRAG:
+    """RAG system with multimodal support using LangChain and OpenAI"""
+    def __init__(self, api_key: str = None, debug: bool = True):
+        """Create the chat model client and an empty conversation state.
+        Args:
+            api_key: OpenAI API key; when falsy, ``OPENAI_API_KEY`` imported
+                from ``config`` is used instead.
+            debug: When true, debug output is printed. Defaults to True.
+        """
         api_key = api_key or OPENAI_API_KEY
+        self.debug = debug
         self.llm = ChatOpenAI(
             model_name=OPENAI_MODEL,
+            api_key=api_key,
             temperature=TEMPERATURE,
             max_tokens=MAX_TOKENS,
         )
+        # Messages exchanged so far; answer_question() appends to this list.
+        self.conversation_history = []
         self.language = LANGUAGE
+        if self.debug:
+            print("✅ MultimodalRAG initialized with DEBUG mode ON")
+    def _debug_print(self, label: str, data: object) -> None:
+        """Print a labelled debug message when debug mode is on.
+        Lists and dicts are shown as their type name plus the first 500
+        characters of ``str(data)`` followed by ``...``; every other value is
+        printed unchanged. Nothing is printed when ``self.debug`` is false.
+        Args:
+            label: Heading shown inside the ``DEBUG [...]`` prefix.
+            data: Value to display.
+        """
+        if self.debug:
+            print(f"\n🔍 DEBUG [{label}]:")
+            if isinstance(data, (list, dict)):
+                print(f"  Type: {type(data).__name__}")
+                # The trailing "..." is appended whether or not text was cut.
+                print(f"  Content: {str(data)[:500]}...")
+            else:
+                print(f"  {data}")
+    def _build_context_prompt(self, search_results: List[Dict]) -> str:
+        """Build context from search results with debug info"""
+        context = "Based on the following document content:\n\n"
+        self._debug_print("Search Results Count", len(search_results))
+        # Separate by type for debugging
+        text_count = 0
+        image_count = 0
+        table_count = 0
+        for idx, result in enumerate(search_results, 1):
+            content_type = result.get('type', 'unknown')
+            content = result.get('content', '')
+            distance = result.get('distance', 0)
+            # Track counts
+            if content_type == 'image':
+                image_count += 1
+            elif content_type == 'table':
+                table_count += 1
+            else:
+                text_count += 1
+            self._debug_print(
+                f"Result {idx}: Type={content_type}, Distance={distance:.3f}, Length={len(content)}",
+                content[:100]
+            )
             if content_type == 'image':
+                context += f"[Image {idx}] {content}\n\n"
             elif content_type == 'table':
+                context += f"[Table {idx}] {content}\n\n"
             else:
+                context += f"[Text {idx}] {content}\n\n"
+        self._debug_print("Context Composition",
+            f"Text: {text_count}, Images: {image_count}, Tables: {table_count}")
+        self._debug_print("Total Context Length", len(context))
         return context
+    def answer_question(self, question: str, search_results: List[Dict], streaming: bool = False) -> str:
+        """Generate answer to user question based on search results"""
         try:
+            self._debug_print("Question", question)
+            # Build context from search results
+            context = self._build_context_prompt(search_results)
+            # Create system message
+            system_message = SystemMessage(
+                content=f"""You are a helpful assistant that answers questions about documents.
+                You work with documents that contain text, tables, and images.
+                Language: {self.language}
+                Provide accurate, concise answers based on the provided context.
+                If information is not found in the context, say so clearly.
+                For tables and images, provide detailed analysis when relevant."""
+            )
+            # Create user message with context
+            user_message = HumanMessage(
+                content=f"{context}\n\nQuestion: {question}\n\nPlease answer based on the context above."
+            )
+            self._debug_print("User Message Length", len(user_message.content))
+            # Add to conversation history
+            self.conversation_history.append(user_message)
+            # Get response using .invoke() instead of calling object directly
+            self._debug_print("Calling LLM", f"Model: {OPENAI_MODEL}")
+            response = self.llm.invoke([system_message] + self.conversation_history)
+            # Add response to history
+            self.conversation_history.append(response)
+            self._debug_print("Response Length", len(response.content))
+            # Keep conversation history manageable (last 10 messages)
+            if len(self.conversation_history) > 10:
+                self.conversation_history = self.conversation_history[-10:]
+            return response.content
         except Exception as e:
+            self._debug_print("ERROR in answer_question", str(e))
+            print(f"Error generating answer: {e}")
+            return f"Error: Could not generate answer. {str(e)}"
+    def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
+        """Summarize extracted document content including images and tables"""
         try:
+            if images is None:
+                images = []
+            if tables is None:
+                tables = []
+            self._debug_print("Document Summarization Started",
+                f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
+            # Extract OCR text from images
+            image_ocr_texts = []
+            for idx, img in enumerate(images):
+                ocr_text = img.get('ocr_text', '')
+                if ocr_text:
+                    image_ocr_texts.append(f"Image {idx}: {ocr_text}")
+                    self._debug_print(f"Image {idx} OCR", ocr_text[:100])
+                else:
+                    self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
+            # Extract table content
+            table_texts = []
+            for idx, tbl in enumerate(tables):
+                table_content = tbl.get('content', '')
+                if table_content:
+                    table_texts.append(f"Table {idx}:\n{table_content}")
+                    self._debug_print(f"Table {idx} Content", table_content[:100])
+                else:
+                    self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
+            # Build comprehensive summary prompt
+            summary_prompt = f"""Please provide a comprehensive summary of the following document content in {self.language}.
+Document Text:
+{document_content}
+"""
+            # Add images if they have OCR text
+            if image_ocr_texts:
+                summary_prompt += f"\nExtracted text from {len(images)} images:\n"
+                summary_prompt += "\n".join(image_ocr_texts)
+                summary_prompt += "\n"
+            # Add tables
+            if table_texts:
+                summary_prompt += f"\nDocument contains {len(tables)} tables:\n"
+                summary_prompt += "\n".join(table_texts)
+                summary_prompt += "\n"
+            summary_prompt += f"""
+Please include in your summary:
+1. Main topics covered
+2. Key points and findings
+3. Important data and numbers
+4. Key information from images (if present)
+5. Key information from tables (if present)
+6. Overall document purpose"""
+            self._debug_print("Summary Prompt Length", len(summary_prompt))
+            self._debug_print("Summary Prompt Content", summary_prompt[:200])
+            message = HumanMessage(content=summary_prompt)
+            self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
+            response = self.llm.invoke([message])
+            self._debug_print("Summary Response Length", len(response.content))
+            return response.content
         except Exception as e:
+            self._debug_print("ERROR in summarize_document", str(e))
+            print(f"Error summarizing document: {e}")
+            return f"Error: Could not summarize document. {str(e)}"
+    def debug_search_results(self, search_results: List[Dict]) -> Dict:
+        """Detailed analysis of search results for debugging"""
+        analysis = {
+            'total_results': len(search_results),
+            'by_type': {'text': 0, 'image': 0, 'table': 0},
+            'average_distance': 0,
+            'images_with_content': 0,
+            'images_empty': 0,
+            'details': []
+        }
+        distances = []
+        for idx, result in enumerate(search_results):
+            content_type = result.get('type', 'unknown')
+            content = result.get('content', '')
+            distance = result.get('distance', 0)
+            if content_type in analysis['by_type']:
+                analysis['by_type'][content_type] += 1
+            distances.append(distance)
+            # Track image specifics
+            if content_type == 'image':
+                if content.strip():
+                    analysis['images_with_content'] += 1
+                else:
+                    analysis['images_empty'] += 1
+            analysis['details'].append({
+                'index': idx,
+                'type': content_type,
+                'distance': distance,
+                'content_length': len(content),
+                'has_content': bool(content.strip())
+            })
+        if distances:
+            analysis['average_distance'] = sum(distances) / len(distances)
+        self._debug_print("Search Results Analysis", analysis)
+        return analysis
+    def clear_history(self) -> None:
+        """Discard the stored conversation.
+        ``conversation_history`` is rebound to a new empty list, so a list
+        previously obtained from ``get_history`` is not emptied. A confirmation
+        line is printed when debug mode is on.
+        """
+        self.conversation_history = []
+        if self.debug:
+            print("✅ Conversation history cleared")
+    def get_history(self) -> List:
+        """Return the stored conversation messages.
+        The internal list object itself is returned, not a copy, so changes
+        made to it by the caller affect this instance.
+        """
+        return self.conversation_history
+    def toggle_debug(self, enabled: bool) -> None:
+        """Switch debug output on or off.
+        Args:
+            enabled: New value for ``self.debug``.
+        """
+        self.debug = enabled
+        # Printed unconditionally so that switching debug off is also confirmed.
+        print(f"🔍 Debug mode: {'ON' if enabled else 'OFF'}")