dnj0 committed on
Commit
4f6cdd1
·
verified ·
1 Parent(s): a6e26b8

Update src/rag_system.py

Browse files
Files changed (1) hide show
  1. src/rag_system.py +264 -320
src/rag_system.py CHANGED
@@ -1,16 +1,24 @@
1
  """
2
- LLM Integration Module using OpenAI GPT-4o and LangChain
3
- FIXED for LangChain 0.1+ with IMAGE DEBUGGING + RESULT LOGGING
4
  """
5
  from typing import List, Dict
6
  from langchain_openai import ChatOpenAI
7
- from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
8
- import os
9
- from config import OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS, LANGUAGE
 
 
 
10
 
11
 
12
  class MultimodalRAG:
13
- """RAG system with multimodal support using LangChain and OpenAI"""
 
 
 
 
 
14
 
15
  def __init__(self, api_key: str = None, debug: bool = True):
16
  api_key = api_key or OPENAI_API_KEY
@@ -25,10 +33,10 @@ class MultimodalRAG:
25
 
26
  self.conversation_history = []
27
  self.language = LANGUAGE
28
- self.summarization_log = [] # Track all summarizations
29
 
30
  if self.debug:
31
- print("βœ… MultimodalRAG initialized with DEBUG mode ON")
32
 
33
  def _debug_print(self, label: str, data: any):
34
  """Print debug information"""
@@ -36,354 +44,290 @@ class MultimodalRAG:
36
  print(f"\nπŸ” DEBUG [{label}]:")
37
  if isinstance(data, (list, dict)):
38
  print(f" Type: {type(data).__name__}")
39
- print(f" Content: {str(data)[:500]}...")
40
  else:
41
  print(f" {data}")
42
 
43
- def _build_context_prompt(self, search_results: List[Dict]) -> str:
44
- """Build context from search results with debug info"""
45
- context = "Based on the following document content:\n\n"
 
 
 
 
46
 
47
- self._debug_print("Search Results Count", len(search_results))
48
-
49
- # Separate by type for debugging
50
- text_count = 0
51
- image_count = 0
52
- table_count = 0
53
-
54
- for idx, result in enumerate(search_results, 1):
55
- content_type = result.get('type', 'unknown')
56
- content = result.get('content', '')
57
- distance = result.get('distance', 0)
58
 
59
- # Track counts
60
- if content_type == 'image':
61
- image_count += 1
62
- elif content_type == 'table':
63
- table_count += 1
64
- else:
65
- text_count += 1
66
 
67
- self._debug_print(
68
- f"Result {idx}: Type={content_type}, Distance={distance:.3f}, Length={len(content)}",
69
- content[:100]
70
- )
71
 
72
- if content_type == 'image':
73
- context += f"[Image {idx}] {content}\n\n"
74
- elif content_type == 'table':
75
- context += f"[Table {idx}] {content}\n\n"
76
- else:
77
- context += f"[Text {idx}] {content}\n\n"
 
 
 
 
 
 
 
 
 
78
 
79
- self._debug_print("Context Composition",
80
- f"Text: {text_count}, Images: {image_count}, Tables: {table_count}")
81
- self._debug_print("Total Context Length", len(context))
82
 
83
- return context
 
 
 
 
 
 
 
84
 
85
- def answer_question(self, question: str, search_results: List[Dict], streaming: bool = False) -> str:
86
- """Generate answer to user question based on search results"""
87
- try:
88
- self._debug_print("Question", question)
89
-
90
- # Build context from search results
91
- context = self._build_context_prompt(search_results)
92
-
93
- # Create system message
94
- system_message = SystemMessage(
95
- content=f"""You are a helpful assistant that answers questions about documents.
96
- You work with documents that contain text, tables, and images.
97
- Language: {self.language}
98
 
99
- Provide accurate, concise answers based on the provided context.
100
- If information is not found in the context, say so clearly.
101
- For tables and images, provide detailed analysis when relevant."""
102
- )
103
-
104
- # Create user message with context
105
- user_message = HumanMessage(
106
- content=f"{context}\n\nQuestion: {question}\n\nPlease answer based on the context above."
107
- )
108
-
109
- self._debug_print("User Message Length", len(user_message.content))
110
-
111
- # Add to conversation history
112
- self.conversation_history.append(user_message)
113
-
114
- # Get response using .invoke() instead of calling object directly
115
- self._debug_print("Calling LLM", f"Model: {OPENAI_MODEL}")
116
- response = self.llm.invoke([system_message] + self.conversation_history)
117
-
118
- # Add response to history
119
- self.conversation_history.append(response)
120
-
121
- self._debug_print("Response Length", len(response.content))
122
-
123
- # Keep conversation history manageable (last 10 messages)
124
- if len(self.conversation_history) > 10:
125
- self.conversation_history = self.conversation_history[-10:]
126
-
127
- return response.content
128
 
129
- except Exception as e:
130
- self._debug_print("ERROR in answer_question", str(e))
131
- print(f"Error generating answer: {e}")
132
- return f"Error: Could not generate answer. {str(e)}"
133
 
134
- def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
135
- """Summarize extracted document content including images and tables with logging"""
136
- try:
137
- if images is None:
138
- images = []
139
- if tables is None:
140
- tables = []
141
-
142
- self._debug_print("Document Summarization Started",
143
- f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
144
-
145
- # Log entry
146
- log_entry = {
147
- 'document_text_length': len(document_content),
148
- 'total_images': len(images),
149
- 'total_tables': len(tables),
150
- 'images_with_ocr': 0,
151
- 'images_empty_ocr': 0,
152
- 'ocr_texts': [],
153
- 'table_texts': [],
154
- 'summary_prompt_length': 0,
155
- 'summary_result': '',
156
- 'summary_result_length': 0
157
- }
158
-
159
- # Extract OCR text from images
160
- image_ocr_texts = []
161
- for idx, img in enumerate(images):
162
- ocr_text = img.get('ocr_text', '')
163
- if ocr_text:
164
- image_ocr_texts.append(f"Image {idx}: {ocr_text}")
165
- log_entry['images_with_ocr'] += 1
166
- log_entry['ocr_texts'].append({
167
- 'image_index': idx,
168
- 'ocr_length': len(ocr_text),
169
- 'ocr_content': ocr_text[:200] # First 200 chars
170
- })
171
- self._debug_print(f"Image {idx} OCR", ocr_text[:100])
172
- else:
173
- log_entry['images_empty_ocr'] += 1
174
- log_entry['ocr_texts'].append({
175
- 'image_index': idx,
176
- 'ocr_length': 0,
177
- 'ocr_content': 'EMPTY'
178
- })
179
- self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
180
 
181
- # Extract table content
182
- table_texts = []
183
- for idx, tbl in enumerate(tables):
184
- table_content = tbl.get('content', '')
185
- if table_content:
186
- table_texts.append(f"Table {idx}:\n{table_content}")
187
- log_entry['table_texts'].append({
188
- 'table_index': idx,
189
- 'table_length': len(table_content),
190
- 'table_content': table_content[:200]
191
- })
192
- self._debug_print(f"Table {idx} Content", table_content[:100])
193
- else:
194
- self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
195
 
196
- # Build comprehensive summary prompt
197
- summary_prompt = f"""Please provide a comprehensive summary of the following document content in {self.language}.
 
 
198
 
199
- Document Text:
200
- {document_content}
201
 
202
- """
203
-
204
- # Add images if they have OCR text
205
- if image_ocr_texts:
206
- summary_prompt += f"\nExtracted text from {len(images)} images:\n"
207
- summary_prompt += "\n".join(image_ocr_texts)
208
- summary_prompt += "\n"
209
-
210
- # Add tables
211
- if table_texts:
212
- summary_prompt += f"\nDocument contains {len(tables)} tables:\n"
213
- summary_prompt += "\n".join(table_texts)
214
- summary_prompt += "\n"
215
-
216
- summary_prompt += f"""
217
- Please include in your summary:
218
- 1. Main topics covered
219
- 2. Key points and findings
220
- 3. Important data and numbers
221
- 4. Key information from images (if present)
222
- 5. Key information from tables (if present)
223
- 6. Overall document purpose"""
224
-
225
- log_entry['summary_prompt_length'] = len(summary_prompt)
226
-
227
- self._debug_print("Summary Prompt Length", len(summary_prompt))
228
- self._debug_print("Summary Prompt Content", summary_prompt[:200])
229
-
230
- message = HumanMessage(content=summary_prompt)
231
- self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
232
-
233
- response = self.llm.invoke([message])
234
- summary = response.content
235
-
236
- log_entry['summary_result'] = summary
237
- log_entry['summary_result_length'] = len(summary)
238
-
239
- self._debug_print("Summary Response Length", len(summary))
240
-
241
- # PRINT DETAILED SUMMARIZATION LOG
242
- self._print_summarization_log(log_entry)
243
-
244
- # Store in log
245
- self.summarization_log.append(log_entry)
246
-
247
- return summary
248
 
249
- except Exception as e:
250
- self._debug_print("ERROR in summarize_document", str(e))
251
- print(f"Error summarizing document: {e}")
252
- return f"Error: Could not summarize document. {str(e)}"
253
 
254
- def _print_summarization_log(self, log_entry: Dict):
255
- """Print detailed summarization results log"""
256
- print("\n" + "="*70)
257
- print("πŸ“Š IMAGE SUMMARIZATION LOG")
258
- print("="*70)
 
259
 
260
- # Document composition
261
- print("\nπŸ“„ DOCUMENT COMPOSITION:")
262
- print(f" Text: {log_entry['document_text_length']:,} characters")
263
- print(f" Images: {log_entry['total_images']} total")
264
- print(f" βœ… With OCR text: {log_entry['images_with_ocr']}")
265
- print(f" ⚠️ Empty OCR: {log_entry['images_empty_ocr']}")
266
- print(f" Tables: {log_entry['total_tables']} total")
 
 
 
 
267
 
268
- # Image OCR details
269
- if log_entry['ocr_texts']:
270
- print("\nπŸ–ΌοΈ IMAGE OCR TEXT DETAILS:")
271
- for ocr in log_entry['ocr_texts']:
272
- idx = ocr['image_index']
273
- length = ocr['ocr_length']
274
- content = ocr['ocr_content']
275
-
276
- if length == 0:
277
- print(f" Image {idx}: ⚠️ EMPTY (0 chars)")
278
- else:
279
- print(f" Image {idx}: βœ… {length} characters")
280
- print(f" Content: {content}...")
 
 
 
 
281
 
282
- # Table details
283
- if log_entry['table_texts']:
284
- print("\nπŸ“‹ TABLE DETAILS:")
285
- for tbl in log_entry['table_texts']:
286
- idx = tbl['table_index']
287
- length = tbl['table_length']
288
- content = tbl['table_content']
289
-
290
- print(f" Table {idx}: {length} characters")
291
- print(f" Content: {content}...")
292
 
293
- # Prompt details
294
- print("\nπŸ“ SUMMARIZATION PROMPT:")
295
- print(f" Total length: {log_entry['summary_prompt_length']:,} characters")
296
- print(f" Includes images: {'βœ… Yes' if log_entry['ocr_texts'] else '❌ No'}")
297
- print(f" Includes tables: {'βœ… Yes' if log_entry['table_texts'] else '❌ No'}")
298
 
299
- # Summary result
300
- print("\n✨ SUMMARY RESULT:")
301
- print(f" Length: {log_entry['summary_result_length']:,} characters")
302
- print(f" Content:")
303
- print(" " + "-"*66)
304
 
305
- # Print summary with line wrapping
306
- summary_lines = log_entry['summary_result'].split('\n')
307
- for line in summary_lines[:15]: # First 15 lines
308
- print(f" {line}")
 
 
 
309
 
310
- if len(summary_lines) > 15:
311
- print(f" ... ({len(summary_lines) - 15} more lines)")
312
 
313
- print(" " + "-"*66)
 
 
 
 
 
 
 
 
 
314
 
315
- print("\n" + "="*70)
316
-
317
- def get_summarization_log(self) -> List[Dict]:
318
- """Get all summarization logs"""
319
- return self.summarization_log
320
-
321
- def print_summarization_history(self):
322
- """Print all summarization logs"""
323
- print("\nπŸ“š SUMMARIZATION HISTORY:")
324
- print(f"Total summarizations: {len(self.summarization_log)}")
325
 
326
- for idx, log in enumerate(self.summarization_log, 1):
327
- print(f"\n{'='*70}")
328
- print(f"Summarization #{idx}")
329
- print(f"{'='*70}")
330
- self._print_summarization_log(log)
331
-
332
- def debug_search_results(self, search_results: List[Dict]) -> Dict:
333
- """Detailed analysis of search results for debugging"""
334
- analysis = {
335
- 'total_results': len(search_results),
336
- 'by_type': {'text': 0, 'image': 0, 'table': 0},
337
- 'average_distance': 0,
338
- 'images_with_content': 0,
339
- 'images_empty': 0,
340
- 'details': []
341
  }
342
 
343
- distances = []
 
344
 
345
- for idx, result in enumerate(search_results):
346
- content_type = result.get('type', 'unknown')
347
- content = result.get('content', '')
348
- distance = result.get('distance', 0)
349
-
350
- if content_type in analysis['by_type']:
351
- analysis['by_type'][content_type] += 1
352
-
353
- distances.append(distance)
354
-
355
- # Track image specifics
356
- if content_type == 'image':
357
- if content.strip():
358
- analysis['images_with_content'] += 1
359
- else:
360
- analysis['images_empty'] += 1
361
-
362
- analysis['details'].append({
363
- 'index': idx,
364
- 'type': content_type,
365
- 'distance': distance,
366
- 'content_length': len(content),
367
- 'has_content': bool(content.strip())
368
- })
369
 
370
- if distances:
371
- analysis['average_distance'] = sum(distances) / len(distances)
372
 
373
- self._debug_print("Search Results Analysis", analysis)
374
- return analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
 
376
  def clear_history(self):
377
  """Clear conversation history"""
378
  self.conversation_history = []
379
  if self.debug:
380
- print("βœ… Conversation history cleared")
381
-
382
- def get_history(self) -> List:
383
- """Get conversation history"""
384
- return self.conversation_history
385
-
386
- def toggle_debug(self, enabled: bool):
387
- """Toggle debug mode on/off"""
388
- self.debug = enabled
389
- print(f"πŸ” Debug mode: {'ON' if enabled else 'OFF'}")
 
1
  """
2
+ Enhanced RAG System - Individual Summarization + Vector Store Persistence
3
+ Summarizes each image, text chunk, and table separately, then stores results
4
  """
5
  from typing import List, Dict
6
  from langchain_openai import ChatOpenAI
7
+ from langchain_core.messages import HumanMessage, SystemMessage
8
+ import hashlib
9
+ from config import (
10
+ OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
11
+ LANGUAGE, CACHE_RESPONSES, BATCH_SEARCH_RESULTS
12
+ )
13
 
14
 
15
  class MultimodalRAG:
16
+ """
17
+ RAG system that:
18
+ 1. Summarizes each component individually
19
+ 2. Stores summaries in vector store
20
+ 3. Enables fine-grained semantic search
21
+ """
22
 
23
  def __init__(self, api_key: str = None, debug: bool = True):
24
  api_key = api_key or OPENAI_API_KEY
 
33
 
34
  self.conversation_history = []
35
  self.language = LANGUAGE
36
+ self.summaries_log = []
37
 
38
  if self.debug:
39
+ print("βœ… EnhancedMultimodalRAG initialized")
40
 
41
  def _debug_print(self, label: str, data: any):
42
  """Print debug information"""
 
44
  print(f"\nπŸ” DEBUG [{label}]:")
45
  if isinstance(data, (list, dict)):
46
  print(f" Type: {type(data).__name__}")
47
+ print(f" Content: {str(data)[:300]}...")
48
  else:
49
  print(f" {data}")
50
 
51
def summarize_image(self, image_ocr_text: str, image_idx: int) -> str:
    """Produce a short LLM summary of one image's OCR text.

    Args:
        image_ocr_text: Raw text extracted from the image by OCR.
        image_idx: Position of the image in the source document.

    Returns:
        A 2-3 sentence summary string, or a bracketed placeholder when the
        OCR text is empty/too short or when the LLM call fails.
    """
    # Guard clause: fewer than 5 meaningful characters is not worth an LLM call.
    if not image_ocr_text or len(image_ocr_text.strip()) < 5:
        return f"[Image {image_idx}: No readable text or empty content]"

    try:
        request = f"""Summarize this text extracted from an image in {self.language}.
Keep it concise but informative. Focus on key information, data, and visual elements.

Image OCR Text:
{image_ocr_text}

Summary (2-3 sentences maximum):"""

        reply = self.llm.invoke([HumanMessage(content=request)])
        condensed = reply.content.strip()

        if self.debug:
            self._debug_print(f"Image {image_idx} Summary", condensed)

        return condensed
    except Exception as e:
        # Degrade gracefully: embed the failure in the returned text so the
        # caller still gets a record for this image.
        print(f"Error summarizing image {image_idx}: {e}")
        return f"[Image {image_idx}: Summarization failed - {str(e)}]"
80
+
81
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
    """Split *text* into overlapping chunks and summarize each one via the LLM.

    Args:
        text: Full document text.
        chunk_size: Target size of each chunk in characters.

    Returns:
        List of dicts: {'type', 'chunk_index', 'original_text', 'summary',
        'chunk_length'}. Chunks under 50 characters and chunks whose LLM call
        fails are skipped, so 'chunk_index' is contiguous over kept chunks only.
    """
    summarized: List[Dict] = []

    # Overlap of 300 chars keeps some context continuity between chunks.
    text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)

    self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")

    # The previous version used `enumerate` but never used the index
    # (it indexed by len(chunks) instead); iterate directly.
    for chunk in text_chunks:
        if len(chunk.strip()) < 50:  # Skip very small chunks
            continue

        try:
            prompt = f"""Summarize this text chunk in {self.language}.
Keep it concise. Extract key points, facts, and main ideas.

Text Chunk:
{chunk}

Summary (2-3 sentences maximum):"""

            response = self.llm.invoke([HumanMessage(content=prompt)])
            summary = response.content.strip()

            chunk_index = len(summarized)  # index among *kept* chunks
            summarized.append({
                'type': 'text_chunk',
                'chunk_index': chunk_index,
                'original_text': chunk[:500],  # Store first 500 chars only
                'summary': summary,
                'chunk_length': len(chunk)
            })

            if self.debug:
                self._debug_print(f"Text Chunk {chunk_index} Summary", summary)

        except Exception as e:
            # Best-effort: a failed chunk is logged and skipped, not fatal.
            print(f"Error summarizing text chunk: {e}")

    return summarized
 
 
 
126
 
127
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
    """Ask the LLM for a short summary of every non-trivial table.

    Args:
        tables: Table dicts; each table's text lives under the 'content' key.

    Returns:
        One dict per summarized table: {'type', 'table_index',
        'original_content', 'summary', 'table_length'}. Tables with fewer
        than 10 characters of content are skipped, as are LLM failures.
    """
    collected: List[Dict] = []

    for position, entry in enumerate(tables):
        body = entry.get('content', '')

        # Ignore empty or near-empty tables.
        if not body or len(body.strip()) < 10:
            continue

        try:
            request = f"""Analyze and summarize this table/structured data in {self.language}.
Extract key insights, row/column meanings, and important figures.

Table Content:
{body}

Summary (2-3 sentences maximum):"""

            answer = self.llm.invoke([HumanMessage(content=request)])
            digest = answer.content.strip()

            collected.append({
                'type': 'table',
                'table_index': position,
                'original_content': body[:500],
                'summary': digest,
                'table_length': len(body)
            })

            if self.debug:
                self._debug_print(f"Table {position} Summary", digest)

        except Exception as e:
            # A failing table is logged and skipped; the rest still proceed.
            print(f"Error summarizing table {position}: {e}")

    return collected
 
 
 
169
 
170
def summarize_images(self, images: List[Dict]) -> List[Dict]:
    """Summarize every image's OCR text, one LLM call per image.

    Args:
        images: Image dicts; OCR text is read from the 'ocr_text' key.

    Returns:
        One dict per image (none skipped): {'type', 'image_index',
        'original_ocr', 'summary', 'ocr_length'}.
    """
    records = []
    for position, img in enumerate(images):
        extracted = img.get('ocr_text', '')
        records.append({
            'type': 'image',
            'image_index': position,
            'original_ocr': extracted[:500],  # keep only a preview of the OCR
            'summary': self.summarize_image(extracted, position),
            'ocr_length': len(extracted)
        })
    return records
190
+
191
def process_and_store_document(
    self,
    text: str,
    images: List[Dict],
    tables: List[Dict],
    vector_store,
    doc_id: str
) -> Dict:
    """Summarize every component (images, text chunks, tables) and persist
    each category's joined summaries into the vector store.

    Args:
        text: Full document text.
        images: Image dicts carrying 'ocr_text' entries.
        tables: Table dicts carrying 'content' entries.
        vector_store: Store exposing add_documents(docs, doc_id).
            # assumes the same add_documents contract as the original call sites — TODO confirm
        doc_id: Identifier used to derive per-category store ids
            (f"{doc_id}_images", f"{doc_id}_text_chunks", f"{doc_id}_tables").

    Returns:
        Dict with 'doc_id', the three per-category summary lists, and
        'total_stored' (count of summaries whose category stored OK).
    """
    print(f"\n{'='*70}")
    print(f"PROCESSING AND STORING: {doc_id}")
    print(f"{'='*70}")

    results = {
        'doc_id': doc_id,
        'image_summaries': [],
        'text_summaries': [],
        'table_summaries': [],
        'total_stored': 0
    }

    # 1. Summarize and store images
    print(f"\nπŸ–ΌοΈ PROCESSING IMAGES ({len(images)} total)")
    print(f"{'─'*70}")
    image_summaries = self.summarize_images(images)
    results['image_summaries'] = image_summaries
    for s in image_summaries:
        print(f" βœ… Image {s['image_index']}: {s['summary'][:50]}...")
    results['total_stored'] += self._store_summaries(
        vector_store, image_summaries,
        ' | '.join(f"Image {s['image_index']}: {s['summary']}" for s in image_summaries),
        f"{doc_id}_images", "image summaries"
    )

    # 2. Summarize and store text chunks
    print(f"\nπŸ“ PROCESSING TEXT CHUNKS")
    print(f"{'─'*70}")
    text_summaries = self.summarize_text_chunks(text)
    results['text_summaries'] = text_summaries
    for s in text_summaries:
        print(f" βœ… Chunk {s['chunk_index']}: {s['summary'][:50]}...")
    results['total_stored'] += self._store_summaries(
        vector_store, text_summaries,
        ' | '.join(f"Chunk {s['chunk_index']}: {s['summary']}" for s in text_summaries),
        f"{doc_id}_text_chunks", "text chunk summaries"
    )

    # 3. Summarize and store tables
    print(f"\nπŸ“‹ PROCESSING TABLES ({len(tables)} total)")
    print(f"{'─'*70}")
    table_summaries = self.summarize_tables(tables)
    results['table_summaries'] = table_summaries
    for s in table_summaries:
        print(f" βœ… Table {s['table_index']}: {s['summary'][:50]}...")
    results['total_stored'] += self._store_summaries(
        vector_store, table_summaries,
        ' | '.join(f"Table {s['table_index']}: {s['summary']}" for s in table_summaries),
        f"{doc_id}_tables", "table summaries"
    )

    # 4. Summary statistics
    print(f"\n{'='*70}")
    print(f"πŸ“Š STORAGE SUMMARY")
    print(f"{'='*70}")
    print(f" Images summarized & stored: {len(image_summaries)}")
    print(f" Text chunks summarized & stored: {len(text_summaries)}")
    print(f" Tables summarized & stored: {len(table_summaries)}")
    print(f" Total items stored: {results['total_stored']}")
    print(f"{'='*70}")

    self.summaries_log.append(results)
    return results

def _store_summaries(self, vector_store, summaries: List[Dict],
                     joined_text: str, store_id: str, label: str) -> int:
    """Persist one category's joined summary text under *store_id*.

    Factored out of process_and_store_document, where the same
    build-docs / add_documents / count / print sequence was repeated
    verbatim three times.

    Returns the number of summaries credited as stored: 0 when the
    category is empty or when add_documents raises (matching the
    original behavior of only incrementing the count on success).
    """
    if not summaries:
        return 0
    docs = {'text': joined_text, 'images': [], 'tables': []}
    try:
        vector_store.add_documents(docs, store_id)
        print(f"βœ… Stored {len(summaries)} {label}")
        return len(summaries)
    except Exception as e:
        print(f"❌ Error storing {label}: {e}")
        return 0
314
+
315
+ def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
316
+ """Split text into overlapping chunks"""
317
+ chunks = []
318
+ start = 0
319
+ while start < len(text):
320
+ end = start + chunk_size
321
+ chunks.append(text[start:end])
322
+ start = end - overlap
323
+ return chunks
324
+
325
def get_summaries_log(self) -> List[Dict]:
    """Return the accumulated processing logs, one entry per document
    handled by process_and_store_document()."""
    return self.summaries_log
328
 
329
def clear_history(self):
    """Drop all stored conversation messages; announce it in debug mode."""
    self.conversation_history = []
    if self.debug:
        print("βœ… Conversation history cleared")