Spaces:
Sleeping
Sleeping
Update src/rag_system.py
Browse files- src/rag_system.py +126 -4
src/rag_system.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
LLM Integration Module using OpenAI GPT-4o and LangChain
|
| 3 |
-
FIXED for LangChain 0.1+ with IMAGE DEBUGGING
|
| 4 |
"""
|
| 5 |
from typing import List, Dict
|
| 6 |
from langchain_openai import ChatOpenAI
|
|
@@ -25,6 +25,7 @@ class MultimodalRAG:
|
|
| 25 |
|
| 26 |
self.conversation_history = []
|
| 27 |
self.language = LANGUAGE
|
|
|
|
| 28 |
|
| 29 |
if self.debug:
|
| 30 |
print("✅ MultimodalRAG initialized with DEBUG mode ON")
|
|
@@ -131,7 +132,7 @@ class MultimodalRAG:
|
|
| 131 |
return f"Error: Could not generate answer. {str(e)}"
|
| 132 |
|
| 133 |
def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
|
| 134 |
-
"""Summarize extracted document content including images and tables"""
|
| 135 |
try:
|
| 136 |
if images is None:
|
| 137 |
images = []
|
|
@@ -141,14 +142,40 @@ class MultimodalRAG:
|
|
| 141 |
self._debug_print("Document Summarization Started",
|
| 142 |
f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
# Extract OCR text from images
|
| 145 |
image_ocr_texts = []
|
| 146 |
for idx, img in enumerate(images):
|
| 147 |
ocr_text = img.get('ocr_text', '')
|
| 148 |
if ocr_text:
|
| 149 |
image_ocr_texts.append(f"Image {idx}: {ocr_text}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
self._debug_print(f"Image {idx} OCR", ocr_text[:100])
|
| 151 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
|
| 153 |
|
| 154 |
# Extract table content
|
|
@@ -157,6 +184,11 @@ class MultimodalRAG:
|
|
| 157 |
table_content = tbl.get('content', '')
|
| 158 |
if table_content:
|
| 159 |
table_texts.append(f"Table {idx}:\n{table_content}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
self._debug_print(f"Table {idx} Content", table_content[:100])
|
| 161 |
else:
|
| 162 |
self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
|
|
@@ -190,6 +222,8 @@ Please include in your summary:
|
|
| 190 |
5. Key information from tables (if present)
|
| 191 |
6. Overall document purpose"""
|
| 192 |
|
|
|
|
|
|
|
| 193 |
self._debug_print("Summary Prompt Length", len(summary_prompt))
|
| 194 |
self._debug_print("Summary Prompt Content", summary_prompt[:200])
|
| 195 |
|
|
@@ -197,16 +231,104 @@ Please include in your summary:
|
|
| 197 |
self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
|
| 198 |
|
| 199 |
response = self.llm.invoke([message])
|
|
|
|
| 200 |
|
| 201 |
-
|
|
|
|
| 202 |
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
except Exception as e:
|
| 206 |
self._debug_print("ERROR in summarize_document", str(e))
|
| 207 |
print(f"Error summarizing document: {e}")
|
| 208 |
return f"Error: Could not summarize document. {str(e)}"
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
def debug_search_results(self, search_results: List[Dict]) -> Dict:
|
| 211 |
"""Detailed analysis of search results for debugging"""
|
| 212 |
analysis = {
|
|
|
|
| 1 |
"""
|
| 2 |
LLM Integration Module using OpenAI GPT-4o and LangChain
|
| 3 |
+
FIXED for LangChain 0.1+ with IMAGE DEBUGGING + RESULT LOGGING
|
| 4 |
"""
|
| 5 |
from typing import List, Dict
|
| 6 |
from langchain_openai import ChatOpenAI
|
|
|
|
| 25 |
|
| 26 |
self.conversation_history = []
|
| 27 |
self.language = LANGUAGE
|
| 28 |
+
self.summarization_log = [] # Track all summarizations
|
| 29 |
|
| 30 |
if self.debug:
|
| 31 |
print("✅ MultimodalRAG initialized with DEBUG mode ON")
|
|
|
|
| 132 |
return f"Error: Could not generate answer. {str(e)}"
|
| 133 |
|
| 134 |
def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
|
| 135 |
+
"""Summarize extracted document content including images and tables with logging"""
|
| 136 |
try:
|
| 137 |
if images is None:
|
| 138 |
images = []
|
|
|
|
| 142 |
self._debug_print("Document Summarization Started",
|
| 143 |
f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}")
|
| 144 |
|
| 145 |
+
# Log entry
|
| 146 |
+
log_entry = {
|
| 147 |
+
'document_text_length': len(document_content),
|
| 148 |
+
'total_images': len(images),
|
| 149 |
+
'total_tables': len(tables),
|
| 150 |
+
'images_with_ocr': 0,
|
| 151 |
+
'images_empty_ocr': 0,
|
| 152 |
+
'ocr_texts': [],
|
| 153 |
+
'table_texts': [],
|
| 154 |
+
'summary_prompt_length': 0,
|
| 155 |
+
'summary_result': '',
|
| 156 |
+
'summary_result_length': 0
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
# Extract OCR text from images
|
| 160 |
image_ocr_texts = []
|
| 161 |
for idx, img in enumerate(images):
|
| 162 |
ocr_text = img.get('ocr_text', '')
|
| 163 |
if ocr_text:
|
| 164 |
image_ocr_texts.append(f"Image {idx}: {ocr_text}")
|
| 165 |
+
log_entry['images_with_ocr'] += 1
|
| 166 |
+
log_entry['ocr_texts'].append({
|
| 167 |
+
'image_index': idx,
|
| 168 |
+
'ocr_length': len(ocr_text),
|
| 169 |
+
'ocr_content': ocr_text[:200] # First 200 chars
|
| 170 |
+
})
|
| 171 |
self._debug_print(f"Image {idx} OCR", ocr_text[:100])
|
| 172 |
else:
|
| 173 |
+
log_entry['images_empty_ocr'] += 1
|
| 174 |
+
log_entry['ocr_texts'].append({
|
| 175 |
+
'image_index': idx,
|
| 176 |
+
'ocr_length': 0,
|
| 177 |
+
'ocr_content': 'EMPTY'
|
| 178 |
+
})
|
| 179 |
self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")
|
| 180 |
|
| 181 |
# Extract table content
|
|
|
|
| 184 |
table_content = tbl.get('content', '')
|
| 185 |
if table_content:
|
| 186 |
table_texts.append(f"Table {idx}:\n{table_content}")
|
| 187 |
+
log_entry['table_texts'].append({
|
| 188 |
+
'table_index': idx,
|
| 189 |
+
'table_length': len(table_content),
|
| 190 |
+
'table_content': table_content[:200]
|
| 191 |
+
})
|
| 192 |
self._debug_print(f"Table {idx} Content", table_content[:100])
|
| 193 |
else:
|
| 194 |
self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")
|
|
|
|
| 222 |
5. Key information from tables (if present)
|
| 223 |
6. Overall document purpose"""
|
| 224 |
|
| 225 |
+
log_entry['summary_prompt_length'] = len(summary_prompt)
|
| 226 |
+
|
| 227 |
self._debug_print("Summary Prompt Length", len(summary_prompt))
|
| 228 |
self._debug_print("Summary Prompt Content", summary_prompt[:200])
|
| 229 |
|
|
|
|
| 231 |
self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")
|
| 232 |
|
| 233 |
response = self.llm.invoke([message])
|
| 234 |
+
summary = response.content
|
| 235 |
|
| 236 |
+
log_entry['summary_result'] = summary
|
| 237 |
+
log_entry['summary_result_length'] = len(summary)
|
| 238 |
|
| 239 |
+
self._debug_print("Summary Response Length", len(summary))
|
| 240 |
+
|
| 241 |
+
# PRINT DETAILED SUMMARIZATION LOG
|
| 242 |
+
self._print_summarization_log(log_entry)
|
| 243 |
+
|
| 244 |
+
# Store in log
|
| 245 |
+
self.summarization_log.append(log_entry)
|
| 246 |
+
|
| 247 |
+
return summary
|
| 248 |
|
| 249 |
except Exception as e:
|
| 250 |
self._debug_print("ERROR in summarize_document", str(e))
|
| 251 |
print(f"Error summarizing document: {e}")
|
| 252 |
return f"Error: Could not summarize document. {str(e)}"
|
| 253 |
|
| 254 |
+
def _print_summarization_log(self, log_entry: Dict):
    """Print a human-readable report for a single summarization run.

    Args:
        log_entry: Dict describing one run. The keys read here are
            ``document_text_length``, ``total_images``, ``total_tables``,
            ``images_with_ocr``, ``images_empty_ocr``, ``ocr_texts``,
            ``table_texts``, ``summary_prompt_length``, ``summary_result``
            and ``summary_result_length``.

    Returns:
        None. The report is written to standard output with ``print``.
    """
    banner = "=" * 70
    rule = " " + "-" * 66
    max_summary_lines = 15

    def preview(content, full_length):
        # The log keeps only a truncated preview next to the full length, so
        # an ellipsis is added only when text was actually cut off.
        return f"{content}..." if full_length > len(content) else content

    print("\n" + banner)
    print("📊 IMAGE SUMMARIZATION LOG")
    print(banner)

    # Document composition
    print("\n📄 DOCUMENT COMPOSITION:")
    print(f" Text: {log_entry['document_text_length']:,} characters")
    print(f" Images: {log_entry['total_images']} total")
    print(f" ✅ With OCR text: {log_entry['images_with_ocr']}")
    print(f" ⚠️ Empty OCR: {log_entry['images_empty_ocr']}")
    print(f" Tables: {log_entry['total_tables']} total")

    # Image OCR details (one entry per image, including empty ones)
    if log_entry['ocr_texts']:
        print("\n🖼️ IMAGE OCR TEXT DETAILS:")
        for ocr in log_entry['ocr_texts']:
            idx = ocr['image_index']
            length = ocr['ocr_length']

            if length == 0:
                print(f" Image {idx}: ⚠️ EMPTY (0 chars)")
            else:
                print(f" Image {idx}: ✅ {length} characters")
                print(f" Content: {preview(ocr['ocr_content'], length)}")

    # Table details
    if log_entry['table_texts']:
        print("\n📋 TABLE DETAILS:")
        for tbl in log_entry['table_texts']:
            idx = tbl['table_index']
            length = tbl['table_length']

            print(f" Table {idx}: {length} characters")
            print(f" Content: {preview(tbl['table_content'], length)}")

    # Prompt details
    print("\n📝 SUMMARIZATION PROMPT:")
    print(f" Total length: {log_entry['summary_prompt_length']:,} characters")
    print(f" Includes images: {'✅ Yes' if log_entry['ocr_texts'] else '❌ No'}")
    print(f" Includes tables: {'✅ Yes' if log_entry['table_texts'] else '❌ No'}")

    # Summary result
    print("\n✨ SUMMARY RESULT:")
    print(f" Length: {log_entry['summary_result_length']:,} characters")
    print(" Content:")
    print(rule)

    # Show only the first lines of the summary to keep the report short
    summary_lines = log_entry['summary_result'].split('\n')
    for line in summary_lines[:max_summary_lines]:
        print(f" {line}")

    hidden = len(summary_lines) - max_summary_lines
    if hidden > 0:
        print(f" ... ({hidden} more lines)")

    print(rule)

    print("\n" + banner)
|
| 316 |
+
|
| 317 |
+
def get_summarization_log(self) -> List[Dict]:
    """Return every summarization log entry recorded so far.

    Returns:
        The ``self.summarization_log`` list itself (not a copy), so changes
        made by the caller are visible on the instance.
    """
    return self.summarization_log
|
| 320 |
+
|
| 321 |
+
def print_summarization_history(self):
    """Print the report for every entry in ``self.summarization_log``.

    Writes a header with the total entry count, then for each entry
    (numbered from 1) a banner followed by the detailed report produced by
    ``self._print_summarization_log``.

    Returns:
        None. All output goes to standard output.
    """
    banner = "=" * 70

    print("\n📚 SUMMARIZATION HISTORY:")
    print(f"Total summarizations: {len(self.summarization_log)}")

    for position, entry in enumerate(self.summarization_log, 1):
        print(f"\n{banner}")
        print(f"Summarization #{position}")
        print(banner)
        self._print_summarization_log(entry)
|
| 331 |
+
|
| 332 |
def debug_search_results(self, search_results: List[Dict]) -> Dict:
|
| 333 |
"""Detailed analysis of search results for debugging"""
|
| 334 |
analysis = {
|