dnj0 committed on
Commit
fa76eb3
·
1 Parent(s): ecab17a
Files changed (2) hide show
  1. src/pdf_parser.py +76 -18
  2. src/rag_system.py +216 -91
src/pdf_parser.py CHANGED
@@ -1,6 +1,5 @@
1
-
2
  """
3
- PDF Parser Module for extracting text, images, and tables
4
  """
5
  import os
6
  import json
@@ -15,10 +14,28 @@ from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
15
 
16
 
17
  class PDFParser:
18
- def __init__(self):
19
  self.docstore_path = Path(DOCSTORE_PATH)
20
  self.docstore_path.mkdir(exist_ok=True)
21
  self.processed_files = self._load_processed_files()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def _load_processed_files(self) -> Dict[str, str]:
24
  """Load list of already processed files with their hashes"""
@@ -49,23 +66,48 @@ class PDFParser:
49
  try:
50
  with open(pdf_path, 'rb') as file:
51
  reader = PyPDF2.PdfReader(file)
52
- for page in reader.pages:
53
- text += page.extract_text() + "\n"
 
 
 
 
 
54
  except Exception as e:
55
- print(f"Error extracting text: {e}")
 
 
56
  return text
57
 
58
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
59
- """Extract images from PDF pages"""
60
  images_data = []
61
  try:
 
 
62
  images = convert_from_path(pdf_path, dpi=150)
 
 
63
  for idx, image in enumerate(images):
 
 
 
64
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
65
  image.save(image_path)
 
66
 
67
- # Extract text from image using OCR
68
- ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
 
 
 
 
 
 
 
 
 
 
69
 
70
  images_data.append({
71
  'page': idx,
@@ -74,19 +116,20 @@ class PDFParser:
74
  'description': f"Image from page {idx + 1}"
75
  })
76
  except Exception as e:
77
- print(f"Error extracting images: {e}")
 
 
78
  return images_data
79
 
80
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
81
  """Extract table content from PDF"""
82
  tables_data = []
83
  try:
84
- # For simple table extraction, we'll use text patterns
85
- # For advanced table detection, consider using 'tabula-py' or 'pdfplumber'
86
  text = self._extract_text_from_pdf(pdf_path)
87
- # Basic table detection (lines with multiple spaces or separators)
88
  lines = text.split('\n')
89
 
 
 
90
  current_table = []
91
  for line in lines:
92
  if '|' in line or '\t' in line:
@@ -104,28 +147,41 @@ class PDFParser:
104
  'content': '\n'.join(current_table),
105
  'description': f"Table {len(tables_data) + 1}"
106
  })
 
 
107
  except Exception as e:
108
- print(f"Error extracting tables: {e}")
 
109
  return tables_data
110
 
111
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
112
- """Parse PDF and extract text, images, and tables"""
113
  file_hash = self._get_file_hash(pdf_path)
114
  doc_id = Path(pdf_path).stem
115
 
 
 
116
  # Check if file was already processed
117
  if doc_id in self.processed_files:
118
  if self.processed_files[doc_id] == file_hash:
119
- print(f"File {doc_id} already processed, skipping...")
120
  return self._load_extracted_data(doc_id)
121
 
122
- print(f"Processing PDF: {doc_id}")
123
 
124
  # Extract content
125
  text = self._extract_text_from_pdf(pdf_path)
126
  images = self._extract_images_from_pdf(pdf_path, doc_id)
127
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
128
 
 
 
 
 
 
 
 
 
129
  # Save extracted data
130
  self._save_extracted_data(doc_id, text, images, tables)
131
 
@@ -145,6 +201,8 @@ class PDFParser:
145
  data_path = self.docstore_path / f"{doc_id}_data.json"
146
  with open(data_path, 'w', encoding='utf-8') as f:
147
  json.dump(data, f, ensure_ascii=False, indent=2)
 
 
148
 
149
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
150
  """Load previously extracted data from docstore"""
@@ -166,4 +224,4 @@ class PDFParser:
166
  all_docs[doc_id] = json.load(f)
167
  except:
168
  pass
169
- return all_docs
 
 
1
  """
2
+ PDF Parser Module with DEBUG for image extraction
3
  """
4
  import os
5
  import json
 
14
 
15
 
16
  class PDFParser:
17
def __init__(self, debug: bool = True):
    """Set up the docstore directory and load the processed-files log.

    Args:
        debug: When True, progress and diagnostic information is printed
            to stdout by ``_debug_print``.
    """
    # Set the flag first so that any helper invoked during construction
    # can safely consult ``self.debug``.
    self.debug = debug

    self.docstore_path = Path(DOCSTORE_PATH)
    # parents=True: a nested DOCSTORE_PATH (e.g. "data/docstore") must not
    # fail just because the intermediate directories do not exist yet.
    self.docstore_path.mkdir(parents=True, exist_ok=True)
    self.processed_files = self._load_processed_files()

    if self.debug:
        print("✅ PDFParser initialized with DEBUG mode ON")
25
+
26
def _debug_print(self, label: str, data: object) -> None:
    """Print a labelled debug record to stdout when debug mode is on.

    Args:
        label: Short heading printed after the "[PDF Parser]" prefix.
        data: Value to display. A dict is printed one ``key: value`` pair
            per line; a list or tuple is printed as its length followed by
            at most the first three items (each truncated to 100
            characters); anything else is printed as-is.
    """
    if not self.debug:
        return

    print(f"\n🔍 [PDF Parser] {label}")
    if isinstance(data, dict):
        for key, val in data.items():
            print(f" {key}: {val}")
    elif isinstance(data, (list, tuple)):
        print(f" Count: {len(data)}")
        # Only a short preview is shown so large collections do not flood stdout.
        for i, item in enumerate(data[:3]):
            print(f" [{i}]: {str(item)[:100]}")
    else:
        print(f" {data}")
39
 
40
  def _load_processed_files(self) -> Dict[str, str]:
41
  """Load list of already processed files with their hashes"""
 
66
  try:
67
  with open(pdf_path, 'rb') as file:
68
  reader = PyPDF2.PdfReader(file)
69
+ page_count = len(reader.pages)
70
+ self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
71
+
72
+ for page_num, page in enumerate(reader.pages):
73
+ page_text = page.extract_text()
74
+ text += page_text + "\n"
75
+ self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
76
  except Exception as e:
77
+ self._debug_print("ERROR extracting text", str(e))
78
+
79
+ self._debug_print("Total Text Extracted", len(text))
80
  return text
81
 
82
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
83
+ """Extract images from PDF pages with detailed debugging"""
84
  images_data = []
85
  try:
86
+ self._debug_print("Image Extraction Started", f"File: {pdf_path}")
87
+
88
  images = convert_from_path(pdf_path, dpi=150)
89
+ self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
90
+
91
  for idx, image in enumerate(images):
92
+ self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
93
+
94
+ # Save image
95
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
96
  image.save(image_path)
97
+ self._debug_print(f"Image {idx} Saved", str(image_path))
98
 
99
+ # Extract text using OCR
100
+ self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR...")
101
+
102
+ try:
103
+ ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
104
+ self._debug_print(f"Image {idx} OCR Result", f"Length: {len(ocr_text)}, Content: {ocr_text[:200] if ocr_text else 'EMPTY'}")
105
+
106
+ if not ocr_text or len(ocr_text.strip()) < 5:
107
+ self._debug_print(f"Image {idx} WARNING", "⚠️ OCR returned empty or very short text!")
108
+ except Exception as ocr_error:
109
+ self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
110
+ ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"
111
 
112
  images_data.append({
113
  'page': idx,
 
116
  'description': f"Image from page {idx + 1}"
117
  })
118
  except Exception as e:
119
+ self._debug_print("ERROR extracting images", str(e))
120
+
121
+ self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
122
  return images_data
123
 
124
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
125
  """Extract table content from PDF"""
126
  tables_data = []
127
  try:
 
 
128
  text = self._extract_text_from_pdf(pdf_path)
 
129
  lines = text.split('\n')
130
 
131
+ self._debug_print("Table Detection", f"Scanning {len(lines)} lines")
132
+
133
  current_table = []
134
  for line in lines:
135
  if '|' in line or '\t' in line:
 
147
  'content': '\n'.join(current_table),
148
  'description': f"Table {len(tables_data) + 1}"
149
  })
150
+
151
+ self._debug_print("Tables Found", len(tables_data))
152
  except Exception as e:
153
+ self._debug_print("ERROR extracting tables", str(e))
154
+
155
  return tables_data
156
 
157
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
158
+ """Parse PDF and extract text, images, and tables with debug output"""
159
  file_hash = self._get_file_hash(pdf_path)
160
  doc_id = Path(pdf_path).stem
161
 
162
+ self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
163
+
164
  # Check if file was already processed
165
  if doc_id in self.processed_files:
166
  if self.processed_files[doc_id] == file_hash:
167
+ self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
168
  return self._load_extracted_data(doc_id)
169
 
170
+ print(f"\n📄 Processing PDF: {doc_id}")
171
 
172
  # Extract content
173
  text = self._extract_text_from_pdf(pdf_path)
174
  images = self._extract_images_from_pdf(pdf_path, doc_id)
175
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
176
 
177
+ # Summary
178
+ self._debug_print("Extraction Summary", {
179
+ 'text_length': len(text),
180
+ 'images_count': len(images),
181
+ 'tables_count': len(tables),
182
+ 'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
183
+ })
184
+
185
  # Save extracted data
186
  self._save_extracted_data(doc_id, text, images, tables)
187
 
 
201
  data_path = self.docstore_path / f"{doc_id}_data.json"
202
  with open(data_path, 'w', encoding='utf-8') as f:
203
  json.dump(data, f, ensure_ascii=False, indent=2)
204
+
205
+ self._debug_print("Data Saved", str(data_path))
206
 
207
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
208
  """Load previously extracted data from docstore"""
 
224
  all_docs[doc_id] = json.load(f)
225
  except:
226
  pass
227
+ return all_docs
src/rag_system.py CHANGED
@@ -1,142 +1,267 @@
1
  """
2
- Token-Optimized RAG System with Caching and Prompt Compression
 
3
  """
4
- import json
5
- import hashlib
6
  from typing import List, Dict
7
  from langchain_openai import ChatOpenAI
8
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
9
- from config import (
10
- OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
11
- LANGUAGE, CACHE_RESPONSES, BATCH_SEARCH_RESULTS
12
- )
13
 
14
 
15
  class MultimodalRAG:
16
- """RAG system optimized for minimal token usage"""
17
 
18
- def __init__(self, api_key: str = None):
19
  api_key = api_key or OPENAI_API_KEY
 
20
 
21
- # Use gpt-4o-mini for 20% cost reduction
22
  self.llm = ChatOpenAI(
23
  model_name=OPENAI_MODEL,
24
- openai_api_key=api_key,
25
  temperature=TEMPERATURE,
26
  max_tokens=MAX_TOKENS,
27
  )
28
 
29
- self.response_cache = {} # Cache responses
30
- self.doc_summaries = {} # Store doc summaries
31
  self.language = LANGUAGE
 
 
 
32
 
33
- def _get_cache_key(self, query: str) -> str:
34
- """Generate cache key for query"""
35
- return hashlib.md5(query.encode()).hexdigest()
 
 
 
 
 
 
36
 
37
- def _compress_context(self, search_results: List[Dict]) -> str:
38
- """Compress context to minimal tokens"""
39
- context = ""
 
 
40
 
41
- for idx, result in enumerate(search_results[:BATCH_SEARCH_RESULTS], 1):
42
- content = result.get('content', '')[:200] # Limit to 200 chars
43
- content_type = result.get('type', 'text')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  if content_type == 'image':
46
- context += f"[IMG{idx}]{content}\n"
47
  elif content_type == 'table':
48
- context += f"[TBL{idx}]{content}\n"
49
  else:
50
- context += f"[{idx}]{content}\n"
 
 
 
 
51
 
52
  return context
53
 
54
- def answer_question(self, question: str, search_results: List[Dict]) -> str:
55
- """Generate answer with minimal tokens"""
56
-
57
- # Check cache first
58
- if CACHE_RESPONSES:
59
- cache_key = self._get_cache_key(question)
60
- if cache_key in self.response_cache:
61
- return self.response_cache[cache_key]
62
-
63
  try:
64
- # Compress context aggressively
65
- context = self._compress_context(search_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # Ultra-compact prompt
68
- prompt = f"""Q:{question}
69
- C:{context}
70
- A:"""
71
 
72
- message = HumanMessage(content=prompt)
73
- response = self.llm([message])
74
- answer = response.content
75
 
76
- # Cache response
77
- if CACHE_RESPONSES:
78
- self.response_cache[cache_key] = answer
79
 
80
- return answer
 
 
 
 
 
 
 
 
 
81
 
82
  except Exception as e:
83
- return f"Error: {str(e)}"
 
 
84
 
85
- def quick_summarize(self, text: str, doc_id: str) -> str:
86
- """Summarize document once and cache"""
87
-
88
- if doc_id in self.doc_summaries:
89
- return self.doc_summaries[doc_id]
90
-
91
  try:
92
- # Truncate text to first 2000 chars
93
- text = text[:2000]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- prompt = f"""Summarize in 50 words:
96
- {text}"""
 
 
 
 
 
 
97
 
98
- message = HumanMessage(content=prompt)
99
- response = self.llm([message])
100
- summary = response.content
101
 
102
- # Cache
103
- self.doc_summaries[doc_id] = summary
104
- return summary
 
 
 
 
 
105
 
106
  except Exception as e:
107
- return f"Error: {str(e)}"
 
 
108
 
109
- def batch_questions(self, questions: List[str], search_results: List[Dict]) -> List[str]:
110
- """Answer multiple questions in one API call"""
 
 
 
 
 
 
 
 
111
 
112
- try:
113
- context = self._compress_context(search_results)
 
 
 
 
114
 
115
- # Combine questions
116
- qa_prompt = "Answer concisely:\n"
117
- for i, q in enumerate(questions, 1):
118
- qa_prompt += f"Q{i}:{q}\n"
119
 
120
- qa_prompt += f"Context:{context}\nAnswers:"
121
 
122
- message = HumanMessage(content=qa_prompt)
123
- response = self.llm([message])
 
 
 
 
124
 
125
- # Parse responses
126
- answers = response.content.split('\n')
127
- return answers[:len(questions)]
 
 
 
 
128
 
129
- except Exception as e:
130
- return [f"Error: {str(e)}"] * len(questions)
 
 
 
 
 
 
 
 
 
131
 
132
- def clear_cache(self):
133
- """Clear response cache"""
134
- self.response_cache.clear()
135
 
136
- def get_cache_stats(self) -> Dict:
137
- """Get cache statistics"""
138
- return {
139
- 'cached_responses': len(self.response_cache),
140
- 'cached_summaries': len(self.doc_summaries),
141
- 'total_cache_size': len(json.dumps(self.response_cache))
142
- }
 
1
  """
2
+ LLM Integration Module using OpenAI GPT-4o and LangChain
3
+ FIXED for LangChain 0.1+ with IMAGE DEBUGGING
4
  """
 
 
5
  from typing import List, Dict
6
  from langchain_openai import ChatOpenAI
7
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
8
+ import os
9
+ from config import OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS, LANGUAGE
 
 
10
 
11
 
12
class MultimodalRAG:
    """RAG system with multimodal support using LangChain and OpenAI.

    Builds a prompt from retrieved search results (text, table and image
    chunks), sends it to a ``ChatOpenAI`` model and keeps a short rolling
    conversation history. When ``debug`` is enabled, diagnostic output is
    printed to stdout.
    """

    # Upper bound on the number of messages kept in ``conversation_history``.
    MAX_HISTORY_MESSAGES = 10

    def __init__(self, api_key: str = None, debug: bool = True):
        """Create the chat model client and initialise conversation state.

        Args:
            api_key: OpenAI API key; falls back to ``OPENAI_API_KEY`` from
                the config module when omitted or empty.
            debug: When True, diagnostic information is printed to stdout.
        """
        api_key = api_key or OPENAI_API_KEY
        self.debug = debug

        self.llm = ChatOpenAI(
            model_name=OPENAI_MODEL,
            api_key=api_key,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )

        self.conversation_history = []
        self.language = LANGUAGE

        if self.debug:
            print("✅ MultimodalRAG initialized with DEBUG mode ON")

    def _debug_print(self, label: str, data: object) -> None:
        """Print a labelled debug record to stdout when debug mode is on.

        Lists and dicts are shown as their type name plus the first 500
        characters of their string form; other values are printed as-is.
        """
        if not self.debug:
            return

        print(f"\n🔍 DEBUG [{label}]:")
        if isinstance(data, (list, dict)):
            print(f" Type: {type(data).__name__}")
            print(f" Content: {str(data)[:500]}...")
        else:
            print(f" {data}")

    def _build_context_prompt(self, search_results: List[Dict]) -> str:
        """Build the context section of the prompt from search results.

        Each result is a dict read via the keys ``type``, ``content`` and
        ``distance``. Results typed ``image`` or ``table`` get a matching
        tag; every other type is tagged as text. Numbering is 1-based and
        follows the order of ``search_results``.

        Returns:
            The context string, starting with a fixed header line.
        """
        self._debug_print("Search Results Count", len(search_results))

        parts = ["Based on the following document content:\n\n"]
        counts = {'text': 0, 'image': 0, 'table': 0}
        tags = {'image': 'Image', 'table': 'Table'}

        for idx, result in enumerate(search_results, 1):
            content_type = result.get('type', 'unknown')
            # A key that is present but None must not break len() or the
            # float formatting below, so normalise to neutral values.
            content = result.get('content') or ''
            distance = result.get('distance') or 0

            # Anything that is not an image or a table is treated as text.
            bucket = content_type if content_type in tags else 'text'
            counts[bucket] += 1

            if self.debug:
                self._debug_print(
                    f"Result {idx}: Type={content_type}, Distance={distance:.3f}, Length={len(content)}",
                    content[:100]
                )

            parts.append(f"[{tags.get(content_type, 'Text')} {idx}] {content}\n\n")

        context = "".join(parts)

        self._debug_print(
            "Context Composition",
            f"Text: {counts['text']}, Images: {counts['image']}, Tables: {counts['table']}"
        )
        self._debug_print("Total Context Length", len(context))

        return context

    def answer_question(self, question: str, search_results: List[Dict], streaming: bool = False) -> str:
        """Generate an answer to ``question`` grounded in ``search_results``.

        Args:
            question: The user's question.
            search_results: Retrieved chunks, as consumed by
                ``_build_context_prompt``.
            streaming: Accepted for interface compatibility; not used by
                this implementation.

        Returns:
            The model's answer text, or a string starting with "Error:"
            if anything fails while building the prompt or calling the model.
        """
        try:
            self._debug_print("Question", question)

            # Build context from search results
            context = self._build_context_prompt(search_results)

            system_message = SystemMessage(
                content=(
                    "You are a helpful assistant that answers questions about documents.\n"
                    "You work with documents that contain text, tables, and images.\n"
                    f"Language: {self.language}\n"
                    "\n"
                    "Provide accurate, concise answers based on the provided context.\n"
                    "If information is not found in the context, say so clearly.\n"
                    "For tables and images, provide detailed analysis when relevant."
                )
            )

            user_message = HumanMessage(
                content=f"{context}\n\nQuestion: {question}\n\nPlease answer based on the context above."
            )

            self._debug_print("User Message Length", len(user_message.content))

            # Use .invoke() rather than calling the model object directly.
            self._debug_print("Calling LLM", f"Model: {OPENAI_MODEL}")
            response = self.llm.invoke(
                [system_message] + self.conversation_history + [user_message]
            )

            # Record the exchange only after the call succeeded, so a failed
            # request never leaves an unanswered user message in the history.
            self.conversation_history.extend([user_message, response])

            self._debug_print("Response Length", len(response.content))

            # Keep conversation history manageable (most recent messages only).
            if len(self.conversation_history) > self.MAX_HISTORY_MESSAGES:
                self.conversation_history = self.conversation_history[-self.MAX_HISTORY_MESSAGES:]

            return response.content

        except Exception as e:
            self._debug_print("ERROR in answer_question", str(e))
            print(f"Error generating answer: {e}")
            return f"Error: Could not generate answer. {str(e)}"

    def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
        """Summarize extracted document content including images and tables.

        Args:
            document_content: The document's extracted text.
            images: Optional image records; the ``ocr_text`` value of each
                is included in the prompt when non-empty.
            tables: Optional table records; the ``content`` value of each
                is included in the prompt when non-empty.

        Returns:
            The model's summary text, or a string starting with "Error:"
            if anything fails.
        """
        try:
            images = [] if images is None else images
            tables = [] if tables is None else tables

            self._debug_print(
                "Document Summarization Started",
                f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}"
            )

            # Collect OCR text from images (indices are 0-based positions in ``images``).
            image_ocr_texts = []
            for idx, img in enumerate(images):
                ocr_text = img.get('ocr_text', '')
                if ocr_text:
                    image_ocr_texts.append(f"Image {idx}: {ocr_text}")
                    self._debug_print(f"Image {idx} OCR", ocr_text[:100])
                else:
                    self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")

            # Collect table content.
            table_texts = []
            for idx, tbl in enumerate(tables):
                table_content = tbl.get('content', '')
                if table_content:
                    table_texts.append(f"Table {idx}:\n{table_content}")
                    self._debug_print(f"Table {idx} Content", table_content[:100])
                else:
                    self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")

            prompt_parts = [
                f"Please provide a comprehensive summary of the following document content in {self.language}.\n"
                "\n"
                "Document Text:\n"
                f"{document_content}\n"
                "\n"
            ]

            if image_ocr_texts:
                # Report the number of images that actually yielded text,
                # not the total number of images passed in.
                prompt_parts.append(f"\nExtracted text from {len(image_ocr_texts)} images:\n")
                prompt_parts.append("\n".join(image_ocr_texts))
                prompt_parts.append("\n")

            if table_texts:
                prompt_parts.append(f"\nDocument contains {len(tables)} tables:\n")
                prompt_parts.append("\n".join(table_texts))
                prompt_parts.append("\n")

            prompt_parts.append(
                "\n"
                "Please include in your summary:\n"
                "1. Main topics covered\n"
                "2. Key points and findings\n"
                "3. Important data and numbers\n"
                "4. Key information from images (if present)\n"
                "5. Key information from tables (if present)\n"
                "6. Overall document purpose"
            )

            summary_prompt = "".join(prompt_parts)

            self._debug_print("Summary Prompt Length", len(summary_prompt))
            self._debug_print("Summary Prompt Content", summary_prompt[:200])

            message = HumanMessage(content=summary_prompt)
            self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")

            response = self.llm.invoke([message])

            self._debug_print("Summary Response Length", len(response.content))

            return response.content

        except Exception as e:
            self._debug_print("ERROR in summarize_document", str(e))
            print(f"Error summarizing document: {e}")
            return f"Error: Could not summarize document. {str(e)}"

    def debug_search_results(self, search_results: List[Dict]) -> Dict:
        """Return a detailed breakdown of search results for debugging.

        The returned dict contains ``total_results``, per-type counts in
        ``by_type`` (only ``text``, ``image`` and ``table`` are counted;
        other types are ignored there), ``average_distance``, counts of
        image results with and without non-blank content, and a ``details``
        list with one entry per result.
        """
        analysis = {
            'total_results': len(search_results),
            'by_type': {'text': 0, 'image': 0, 'table': 0},
            'average_distance': 0,
            'images_with_content': 0,
            'images_empty': 0,
            'details': []
        }

        distances = []

        for idx, result in enumerate(search_results):
            content_type = result.get('type', 'unknown')
            # Normalise explicit None values so strip()/len()/sum() cannot fail.
            content = result.get('content') or ''
            distance = result.get('distance') or 0
            has_content = bool(content.strip())

            if content_type in analysis['by_type']:
                analysis['by_type'][content_type] += 1

            distances.append(distance)

            # Track image specifics
            if content_type == 'image':
                key = 'images_with_content' if has_content else 'images_empty'
                analysis[key] += 1

            analysis['details'].append({
                'index': idx,
                'type': content_type,
                'distance': distance,
                'content_length': len(content),
                'has_content': has_content
            })

        if distances:
            analysis['average_distance'] = sum(distances) / len(distances)

        self._debug_print("Search Results Analysis", analysis)
        return analysis

    def clear_history(self):
        """Clear conversation history."""
        self.conversation_history = []
        if self.debug:
            print("✅ Conversation history cleared")

    def get_history(self) -> List:
        """Return the conversation history list (the live object, not a copy)."""
        return self.conversation_history

    def toggle_debug(self, enabled: bool):
        """Turn debug mode on or off and print the new state."""
        self.debug = enabled
        print(f"🔍 Debug mode: {'ON' if enabled else 'OFF'}")