dnj0 committed on
Commit
fc31dcc
·
verified ·
1 Parent(s): bf6fcc4

Update src/rag_system.py

Browse files
Files changed (1) hide show
  1. src/rag_system.py +126 -4
src/rag_system.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  LLM Integration Module using OpenAI GPT-4o and LangChain
3
- FIXED for LangChain 0.1+ with IMAGE DEBUGGING
4
  """
5
  from typing import List, Dict
6
  from langchain_openai import ChatOpenAI
@@ -25,6 +25,7 @@ class MultimodalRAG:
25
 
26
  self.conversation_history = []
27
  self.language = LANGUAGE
 
28
 
29
  if self.debug:
30
  print("✅ MultimodalRAG initialized with DEBUG mode ON")
@@ -131,7 +132,7 @@ class MultimodalRAG:
131
  return f"Error: Could not generate answer. {str(e)}"
132
 
133
  def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
134
- """Summarize extracted document content including images and tables"""
135
  try:
136
  if images is None:
137
  images = []
@@ -141,14 +142,40 @@ class MultimodalRAG:
141
  self._debug_print("Document Summarization Started",
142
  f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  # Extract OCR text from images
145
  image_ocr_texts = []
146
  for idx, img in enumerate(images):
147
  ocr_text = img.get('ocr_text', '')
148
  if ocr_text:
149
  image_ocr_texts.append(f"Image {idx}: {ocr_text}")
 
 
 
 
 
 
150
  self._debug_print(f"Image {idx} OCR", ocr_text[:100])
151
  else:
 
 
 
 
 
 
152
  self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
153
 
154
  # Extract table content
@@ -157,6 +184,11 @@ class MultimodalRAG:
157
  table_content = tbl.get('content', '')
158
  if table_content:
159
  table_texts.append(f"Table {idx}:\n{table_content}")
 
 
 
 
 
160
  self._debug_print(f"Table {idx} Content", table_content[:100])
161
  else:
162
  self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
@@ -190,6 +222,8 @@ Please include in your summary:
190
  5. Key information from tables (if present)
191
  6. Overall document purpose"""
192
 
 
 
193
  self._debug_print("Summary Prompt Length", len(summary_prompt))
194
  self._debug_print("Summary Prompt Content", summary_prompt[:200])
195
 
@@ -197,16 +231,104 @@ Please include in your summary:
197
  self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
198
 
199
  response = self.llm.invoke([message])
 
200
 
201
- self._debug_print("Summary Response Length", len(response.content))
 
202
 
203
- return response.content
 
 
 
 
 
 
 
 
204
 
205
  except Exception as e:
206
  self._debug_print("ERROR in summarize_document", str(e))
207
  print(f"Error summarizing document: {e}")
208
  return f"Error: Could not summarize document. {str(e)}"
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  def debug_search_results(self, search_results: List[Dict]) -> Dict:
211
  """Detailed analysis of search results for debugging"""
212
  analysis = {
 
1
  """
2
  LLM Integration Module using OpenAI GPT-4o and LangChain
3
+ FIXED for LangChain 0.1+ with IMAGE DEBUGGING + RESULT LOGGING
4
  """
5
  from typing import List, Dict
6
  from langchain_openai import ChatOpenAI
 
25
 
26
  self.conversation_history = []
27
  self.language = LANGUAGE
28
+ self.summarization_log = [] # Track all summarizations
29
 
30
  if self.debug:
31
  print("✅ MultimodalRAG initialized with DEBUG mode ON")
 
132
  return f"Error: Could not generate answer. {str(e)}"
133
 
134
  def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
135
+ """Summarize extracted document content including images and tables with logging"""
136
  try:
137
  if images is None:
138
  images = []
 
142
  self._debug_print("Document Summarization Started",
143
  f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
144
 
145
+ # Log entry
146
+ log_entry = {
147
+ 'document_text_length': len(document_content),
148
+ 'total_images': len(images),
149
+ 'total_tables': len(tables),
150
+ 'images_with_ocr': 0,
151
+ 'images_empty_ocr': 0,
152
+ 'ocr_texts': [],
153
+ 'table_texts': [],
154
+ 'summary_prompt_length': 0,
155
+ 'summary_result': '',
156
+ 'summary_result_length': 0
157
+ }
158
+
159
  # Extract OCR text from images
160
  image_ocr_texts = []
161
  for idx, img in enumerate(images):
162
  ocr_text = img.get('ocr_text', '')
163
  if ocr_text:
164
  image_ocr_texts.append(f"Image {idx}: {ocr_text}")
165
+ log_entry['images_with_ocr'] += 1
166
+ log_entry['ocr_texts'].append({
167
+ 'image_index': idx,
168
+ 'ocr_length': len(ocr_text),
169
+ 'ocr_content': ocr_text[:200] # First 200 chars
170
+ })
171
  self._debug_print(f"Image {idx} OCR", ocr_text[:100])
172
  else:
173
+ log_entry['images_empty_ocr'] += 1
174
+ log_entry['ocr_texts'].append({
175
+ 'image_index': idx,
176
+ 'ocr_length': 0,
177
+ 'ocr_content': 'EMPTY'
178
+ })
179
  self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
180
 
181
  # Extract table content
 
184
  table_content = tbl.get('content', '')
185
  if table_content:
186
  table_texts.append(f"Table {idx}:\n{table_content}")
187
+ log_entry['table_texts'].append({
188
+ 'table_index': idx,
189
+ 'table_length': len(table_content),
190
+ 'table_content': table_content[:200]
191
+ })
192
  self._debug_print(f"Table {idx} Content", table_content[:100])
193
  else:
194
  self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
 
222
  5. Key information from tables (if present)
223
  6. Overall document purpose"""
224
 
225
+ log_entry['summary_prompt_length'] = len(summary_prompt)
226
+
227
  self._debug_print("Summary Prompt Length", len(summary_prompt))
228
  self._debug_print("Summary Prompt Content", summary_prompt[:200])
229
 
 
231
  self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
232
 
233
  response = self.llm.invoke([message])
234
+ summary = response.content
235
 
236
+ log_entry['summary_result'] = summary
237
+ log_entry['summary_result_length'] = len(summary)
238
 
239
+ self._debug_print("Summary Response Length", len(summary))
240
+
241
+ # PRINT DETAILED SUMMARIZATION LOG
242
+ self._print_summarization_log(log_entry)
243
+
244
+ # Store in log
245
+ self.summarization_log.append(log_entry)
246
+
247
+ return summary
248
 
249
  except Exception as e:
250
  self._debug_print("ERROR in summarize_document", str(e))
251
  print(f"Error summarizing document: {e}")
252
  return f"Error: Could not summarize document. {str(e)}"
253
 
254
def _print_summarization_log(self, log_entry: Dict):
    """Print a human-readable report for one summarization log entry.

    Args:
        log_entry: Dict with the keys populated by ``summarize_document``:
            ``document_text_length``, ``total_images``, ``total_tables``,
            ``images_with_ocr``, ``images_empty_ocr``, ``ocr_texts``,
            ``table_texts``, ``summary_prompt_length``, ``summary_result``
            and ``summary_result_length``.

    Returns:
        None. The report is written to stdout.
    """
    banner = "=" * 70
    rule = " " + "-" * 66
    max_summary_lines = 15

    def preview(snippet, full_length):
        # Stored snippets are capped by the caller; only mark them with an
        # ellipsis when the recorded full length shows text was cut off.
        return f"{snippet}..." if full_length > len(snippet) else snippet

    print("\n" + banner)
    print("📊 IMAGE SUMMARIZATION LOG")
    print(banner)

    # Document composition
    print("\n📄 DOCUMENT COMPOSITION:")
    print(f" Text: {log_entry['document_text_length']:,} characters")
    print(f" Images: {log_entry['total_images']} total")
    print(f" ✅ With OCR text: {log_entry['images_with_ocr']}")
    print(f" ⚠️ Empty OCR: {log_entry['images_empty_ocr']}")
    print(f" Tables: {log_entry['total_tables']} total")

    # Image OCR details (one record per image, including empty ones)
    if log_entry['ocr_texts']:
        print("\n🖼️ IMAGE OCR TEXT DETAILS:")
        for ocr in log_entry['ocr_texts']:
            idx = ocr['image_index']
            length = ocr['ocr_length']
            if length == 0:
                print(f" Image {idx}: ⚠️ EMPTY (0 chars)")
                continue
            print(f" Image {idx}: ✅ {length} characters")
            print(f" Content: {preview(ocr['ocr_content'], length)}")

    # Table details (only non-empty tables are recorded)
    if log_entry['table_texts']:
        print("\n📋 TABLE DETAILS:")
        for tbl in log_entry['table_texts']:
            idx = tbl['table_index']
            length = tbl['table_length']
            print(f" Table {idx}: {length} characters")
            print(f" Content: {preview(tbl['table_content'], length)}")

    # Prompt details.
    # 'ocr_texts' also holds 'EMPTY' placeholder records, so its truthiness
    # does not mean any OCR text was available; use the non-empty counter.
    has_image_text = log_entry['images_with_ocr'] > 0
    has_table_text = bool(log_entry['table_texts'])
    print("\n📝 SUMMARIZATION PROMPT:")
    print(f" Total length: {log_entry['summary_prompt_length']:,} characters")
    print(f" Includes images: {'✅ Yes' if has_image_text else '❌ No'}")
    print(f" Includes tables: {'✅ Yes' if has_table_text else '❌ No'}")

    # Summary result
    print("\n✨ SUMMARY RESULT:")
    print(f" Length: {log_entry['summary_result_length']:,} characters")
    print(" Content:")
    print(rule)

    # Show only the head of the summary to keep the report compact.
    summary_lines = log_entry['summary_result'].split('\n')
    for line in summary_lines[:max_summary_lines]:
        print(f" {line}")

    hidden = len(summary_lines) - max_summary_lines
    if hidden > 0:
        print(f" ... ({hidden} more lines)")

    print(rule)

    print("\n" + banner)
316
+
317
def get_summarization_log(self) -> List[Dict]:
    """Return the summarization log entries recorded on this instance.

    The list returned is the instance's own ``summarization_log`` object,
    not a copy.
    """
    recorded_entries = self.summarization_log
    return recorded_entries
320
+
321
def print_summarization_history(self):
    """Print every stored summarization log entry, numbered from 1.

    Writes a header with the total count, then for each entry a banner
    with its position followed by the detailed report produced by
    ``_print_summarization_log``.
    """
    entries = self.summarization_log
    divider = "=" * 70

    print("\n📚 SUMMARIZATION HISTORY:")
    print("Total summarizations: " + str(len(entries)))

    position = 0
    for entry in entries:
        position += 1
        print("\n" + divider)
        print("Summarization #" + str(position))
        print(divider)
        self._print_summarization_log(entry)
331
+
332
  def debug_search_results(self, search_results: List[Dict]) -> Dict:
333
  """Detailed analysis of search results for debugging"""
334
  analysis = {