dnj0 committed on
Commit
719ab54
·
verified ·
1 Parent(s): ed4a4a3

Update src/rag_system.py

Browse files
Files changed (1) hide show
  1. src/rag_system.py +30 -116
src/rag_system.py CHANGED
@@ -1,8 +1,4 @@
1
- """
2
- Enhanced RAG System - Visual Image Analysis
3
- Sends base64 images directly to GPT-4o for visual analysis (not just OCR)
4
- Then stores results in vector store
5
- """
6
  from typing import List, Dict
7
  from langchain_openai import ChatOpenAI
8
  from langchain_core.messages import HumanMessage, SystemMessage
@@ -16,21 +12,13 @@ from config import (
16
 
17
 
18
  class VisualMultimodalRAG:
19
- """
20
- RAG system that:
21
- 1. Sends images as base64 to GPT-4o for visual analysis
22
- 2. Gets detailed visual descriptions and insights
23
- 3. Stores visual analysis in vector store
24
- 4. Enables image-based semantic search
25
- """
26
-
27
  def __init__(self, api_key: str = None, debug: bool = True):
28
  api_key = api_key or OPENAI_API_KEY
29
  self.debug = debug
30
 
31
- # Use gpt-4o for vision capabilities
32
  self.llm = ChatOpenAI(
33
- model_name="gpt-4o-mini", # CRITICAL: gpt-4o has vision
34
  api_key=api_key,
35
  temperature=TEMPERATURE,
36
  max_tokens=MAX_TOKENS,
@@ -40,10 +28,9 @@ class VisualMultimodalRAG:
40
  self.visual_summaries_log = []
41
 
42
  if self.debug:
43
- print("βœ… VisualMultimodalRAG initialized with gpt-4o (vision model)")
44
 
45
  def _debug_print(self, label: str, data: any):
46
- """Print debug information"""
47
  if self.debug:
48
  print(f"\nπŸ” DEBUG [{label}]:")
49
  if isinstance(data, (list, dict)):
@@ -53,7 +40,6 @@ class VisualMultimodalRAG:
53
  print(f" {data}")
54
 
55
  def _image_to_base64(self, image_path: str) -> str:
56
- """Convert image file to base64 string"""
57
  try:
58
  with open(image_path, 'rb') as image_file:
59
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
@@ -63,28 +49,15 @@ class VisualMultimodalRAG:
63
  return None
64
 
65
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
66
- """
67
- Send actual image (base64) to gpt-4o for visual analysis
68
- Returns detailed visual analysis/description
69
-
70
- gpt-4o can see:
71
- - Charts, graphs, diagrams
72
- - Tables and structured data
73
- - Photos and drawings
74
- - Handwritten text
75
- - Screenshots
76
- - Any visual content
77
- """
78
  if not os.path.exists(image_path):
79
  return f"[Image {image_idx}: File not found - {image_path}]"
80
 
81
  try:
82
- # Convert image to base64
83
  image_base64 = self._image_to_base64(image_path)
84
  if not image_base64:
85
  return f"[Image {image_idx}: Could not convert to base64]"
86
 
87
- # Determine image type
88
  file_ext = Path(image_path).suffix.lower()
89
  media_type_map = {
90
  '.jpg': 'image/jpeg',
@@ -108,23 +81,21 @@ class VisualMultimodalRAG:
108
  },
109
  {
110
  "type": "text",
111
- "text": f"""Analyze this image in detail in {self.language}.
112
-
113
- Provide a comprehensive visual analysis including:
114
- 1. **What you see** - Main objects, elements, structure
115
- 2. **Data/Content** - Any numbers, text, charts, graphs
116
- 3. **Purpose** - What this image is showing or representing
117
- 4. **Key insights** - Important patterns, trends, or information
118
- 5. **Connections** - How this relates to document content
119
-
120
- Be specific and detailed. Focus on visual information that cannot be extracted from text alone.
121
 
122
  Analysis:"""
123
  }
124
  ],
125
  )
126
 
127
- # Call gpt-4o with vision
128
  response = self.llm.invoke([message])
129
  analysis = response.content.strip()
130
 
@@ -140,10 +111,7 @@ Analysis:"""
140
  return error_msg
141
 
142
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
143
- """
144
- Analyze each image visually using gpt-4o vision
145
- Returns list of {image_index, visual_analysis, type}
146
- """
147
  visual_analyses = []
148
 
149
  for idx, image in enumerate(images):
@@ -153,7 +121,6 @@ Analysis:"""
153
  print(f"⚠️ Image {idx}: No path provided")
154
  continue
155
 
156
- # Analyze image visually (not just OCR)
157
  visual_analysis = self.analyze_image_visually(image_path, idx)
158
 
159
  visual_analyses.append({
@@ -167,9 +134,7 @@ Analysis:"""
167
  return visual_analyses
168
 
169
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
170
- """
171
- Chunk text and summarize each chunk individually
172
- """
173
  chunks = []
174
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
175
 
@@ -181,12 +146,12 @@ Analysis:"""
181
 
182
  try:
183
  prompt = f"""Summarize this text chunk in {self.language}.
184
- Keep it concise. Extract key points, facts, and main ideas.
185
 
186
  Text Chunk:
187
  {chunk}
188
 
189
- Summary (2-3 sentences maximum):"""
190
 
191
  message = HumanMessage(content=prompt)
192
  response = self.llm.invoke([message])
@@ -209,9 +174,6 @@ Summary (2-3 sentences maximum):"""
209
  return chunks
210
 
211
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
212
- """
213
- Summarize each table individually
214
- """
215
  summaries = []
216
 
217
  for idx, table in enumerate(tables):
@@ -222,12 +184,12 @@ Summary (2-3 sentences maximum):"""
222
 
223
  try:
224
  prompt = f"""Analyze and summarize this table/structured data in {self.language}.
225
- Extract key insights, row/column meanings, and important figures.
226
 
227
  Table Content:
228
  {table_content}
229
 
230
- Summary (2-3 sentences maximum):"""
231
 
232
  message = HumanMessage(content=prompt)
233
  response = self.llm.invoke([message])
@@ -257,10 +219,7 @@ Summary (2-3 sentences maximum):"""
257
  vector_store,
258
  doc_id: str
259
  ) -> Dict:
260
- """
261
- Main function: Analyze all components visually and store in vector store
262
- Images are analyzed using gpt-4o vision (not just OCR)
263
- """
264
  print(f"\n{'='*70}")
265
  print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
266
  print(f"{'='*70}")
@@ -273,14 +232,12 @@ Summary (2-3 sentences maximum):"""
273
  'total_stored': 0
274
  }
275
 
276
- # 1. Analyze images VISUALLY using gpt-4o
277
  print(f"\nπŸ–ΌοΈ VISUAL IMAGE ANALYSIS (gpt-4o vision) ({len(images)} total)")
278
  print(f"{'─'*70}")
279
 
280
  image_analyses = self.analyze_images_visually(images)
281
  results['image_visual_analyses'] = image_analyses
282
 
283
- # Store each image analysis in vector store
284
  image_docs = {
285
  'text': ' | '.join([
286
  f"Image {a['image_index']}: {a['visual_analysis']}"
@@ -306,7 +263,6 @@ Summary (2-3 sentences maximum):"""
306
  except Exception as e:
307
  print(f"❌ Error storing image analyses: {e}")
308
 
309
- # 2. Summarize and store text chunks
310
  print(f"\nπŸ“ TEXT CHUNK SUMMARIZATION")
311
  print(f"{'─'*70}")
312
 
@@ -334,7 +290,6 @@ Summary (2-3 sentences maximum):"""
334
  except Exception as e:
335
  print(f"❌ Error storing text summaries: {e}")
336
 
337
- # 3. Summarize and store tables
338
  print(f"\nπŸ“‹ TABLE SUMMARIZATION ({len(tables)} total)")
339
  print(f"{'─'*70}")
340
 
@@ -362,7 +317,6 @@ Summary (2-3 sentences maximum):"""
362
  except Exception as e:
363
  print(f"❌ Error storing table summaries: {e}")
364
 
365
- # 4. Summary statistics
366
  print(f"\n{'='*70}")
367
  print(f"πŸ“Š STORAGE SUMMARY")
368
  print(f"{'='*70}")
@@ -376,7 +330,6 @@ Summary (2-3 sentences maximum):"""
376
  return results
377
 
378
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
379
- """Split text into overlapping chunks"""
380
  chunks = []
381
  start = 0
382
  while start < len(text):
@@ -386,24 +339,18 @@ Summary (2-3 sentences maximum):"""
386
  return chunks
387
 
388
  def get_visual_summaries_log(self) -> List[Dict]:
389
- """Get all visual analysis logs"""
390
  return self.visual_summaries_log
391
 
392
 
393
  class AnsweringRAG:
394
- """
395
- RAG system that:
396
- 1. Searches vector store for relevant content
397
- 2. ANALYZES search results
398
- 3. Generates intelligent answers based on context
399
- """
400
 
401
  def __init__(self, api_key: str = None, debug: bool = True):
402
  api_key = api_key or OPENAI_API_KEY
403
  self.debug = debug
404
 
405
  self.llm = ChatOpenAI(
406
- model_name="gpt-4o-mini", # Use gpt-4o for better understanding
407
  api_key=api_key,
408
  temperature=TEMPERATURE,
409
  max_tokens=MAX_TOKENS,
@@ -413,10 +360,9 @@ class AnsweringRAG:
413
  self.answer_log = []
414
 
415
  if self.debug:
416
- print("βœ… AnsweringRAG initialized with answer generation")
417
 
418
  def _debug_print(self, label: str, data: any):
419
- """Print debug information"""
420
  if self.debug:
421
  print(f"\nπŸ” DEBUG [{label}]:")
422
  if isinstance(data, (list, dict)):
@@ -430,18 +376,7 @@ class AnsweringRAG:
430
  question: str,
431
  search_results: List[Dict]
432
  ) -> Dict:
433
- """
434
- Analyze search results and generate intelligent answer
435
-
436
- Returns:
437
- {
438
- 'question': user question,
439
- 'answer': detailed answer,
440
- 'sources_used': number of sources,
441
- 'confidence': low/medium/high,
442
- 'search_results': original search results
443
- }
444
- """
445
 
446
  print(f"\n{'='*70}")
447
  print(f"ANALYZING QUESTION & GENERATING ANSWER")
@@ -450,15 +385,10 @@ class AnsweringRAG:
450
  print(f"\n❓ Question: {question}")
451
  print(f"πŸ“Š Search Results Found: {len(search_results)}")
452
 
453
- # Check if we have search results
454
  if not search_results:
455
  print(f"⚠️ No search results found!")
456
- answer = f"""I could not find relevant information in the document to answer your question: "{question}"
457
-
458
- Try:
459
- - Using different keywords
460
- - Breaking the question into smaller parts
461
- - Asking about other topics in the document"""
462
 
463
  result = {
464
  'question': question,
@@ -470,7 +400,6 @@ Try:
470
  self.answer_log.append(result)
471
  return result
472
 
473
- # Build context from search results
474
  context_parts = []
475
  for idx, result in enumerate(search_results, 1):
476
  content = result.get('content', '')
@@ -487,11 +416,10 @@ Try:
487
 
488
  self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
489
 
490
- # Build prompt to analyze results and answer question
491
  analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
492
-
493
  USER QUESTION:
494
- "{question}"
495
 
496
  RELEVANT CONTENT FROM DOCUMENT:
497
  {full_context}
@@ -503,7 +431,6 @@ INSTRUCTIONS:
503
  4. If the content doesn't fully answer the question, explain what information is available
504
  5. Be specific and cite the content when relevant
505
  6. Structure your answer clearly with key points
506
-
507
  ANSWER:"""
508
 
509
  print(f"\nπŸ” Analyzing search results...")
@@ -511,12 +438,10 @@ ANSWER:"""
511
  print(f" Sources: {len(search_results)}")
512
 
513
  try:
514
- # Call LLM to analyze and answer
515
  message = HumanMessage(content=analysis_prompt)
516
  response = self.llm.invoke([message])
517
  answer = response.content.strip()
518
 
519
- # Determine confidence level
520
  confidence = self._estimate_confidence(len(search_results), answer)
521
 
522
  print(f"βœ… Answer generated successfully")
@@ -551,18 +476,14 @@ ANSWER:"""
551
  return result
552
 
553
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
554
- """Estimate confidence level of answer"""
555
  answer_length = len(answer)
556
 
557
- # High confidence: multiple sources, substantial answer
558
  if sources_count >= 3 and answer_length > 500:
559
  return "high"
560
 
561
- # Medium confidence: some sources, decent answer
562
  elif sources_count >= 2 and answer_length > 200:
563
  return "medium"
564
 
565
- # Low confidence: few sources or short answer
566
  else:
567
  return "low"
568
 
@@ -571,14 +492,9 @@ ANSWER:"""
571
  question: str,
572
  search_results: List[Dict]
573
  ) -> Dict:
574
- """
575
- Get answer AND properly formatted sources
576
- Returns both answer and formatted source citations
577
- """
578
-
579
  result = self.analyze_and_answer(question, search_results)
580
 
581
- # Format sources for display
582
  formatted_sources = []
583
  for idx, source in enumerate(result['search_results'], 1):
584
  formatted_sources.append({
@@ -592,11 +508,9 @@ ANSWER:"""
592
  return result
593
 
594
  def get_answer_log(self) -> List[Dict]:
595
- """Get all answer generation logs"""
596
  return self.answer_log
597
 
598
  def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
599
- """Pretty print answer with sources"""
600
 
601
  print(f"\n{'='*70}")
602
  print(f"ANSWER TO: {result['question']}")
 
1
+
 
 
 
 
2
  from typing import List, Dict
3
  from langchain_openai import ChatOpenAI
4
  from langchain_core.messages import HumanMessage, SystemMessage
 
12
 
13
 
14
  class VisualMultimodalRAG:
15
+
 
 
 
 
 
 
 
16
  def __init__(self, api_key: str = None, debug: bool = True):
17
  api_key = api_key or OPENAI_API_KEY
18
  self.debug = debug
19
 
 
20
  self.llm = ChatOpenAI(
21
+ model_name="gpt-4o-mini",
22
  api_key=api_key,
23
  temperature=TEMPERATURE,
24
  max_tokens=MAX_TOKENS,
 
28
  self.visual_summaries_log = []
29
 
30
  if self.debug:
31
+ print("βœ… VisualMultimodalRAG initialized")
32
 
33
  def _debug_print(self, label: str, data: any):
 
34
  if self.debug:
35
  print(f"\nπŸ” DEBUG [{label}]:")
36
  if isinstance(data, (list, dict)):
 
40
  print(f" {data}")
41
 
42
  def _image_to_base64(self, image_path: str) -> str:
 
43
  try:
44
  with open(image_path, 'rb') as image_file:
45
  image_data = base64.b64encode(image_file.read()).decode('utf-8')
 
49
  return None
50
 
51
  def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
52
+
 
 
 
 
 
 
 
 
 
 
 
53
  if not os.path.exists(image_path):
54
  return f"[Image {image_idx}: File not found - {image_path}]"
55
 
56
  try:
 
57
  image_base64 = self._image_to_base64(image_path)
58
  if not image_base64:
59
  return f"[Image {image_idx}: Could not convert to base64]"
60
 
 
61
  file_ext = Path(image_path).suffix.lower()
62
  media_type_map = {
63
  '.jpg': 'image/jpeg',
 
81
  },
82
  {
83
  "type": "text",
84
+ "text": f"""You are assistant for analyzing and aggregating information. Analyze this image.
85
+
86
+ Provide a visual analysis that includes:
87
+ 1. Main objects and element
88
+ 2. Data/Content - Any numbers, text, charts, graphs
89
+ 3. What this image is showing or representing
90
+ 4. Important patterns, trends, or information
91
+ 5. How image relates to document content
92
+ Be brief and meaningful. Focus on visual information that cannot be extracted from text. Response on {self.language}.
 
93
 
94
  Analysis:"""
95
  }
96
  ],
97
  )
98
 
 
99
  response = self.llm.invoke([message])
100
  analysis = response.content.strip()
101
 
 
111
  return error_msg
112
 
113
  def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
114
+
 
 
 
115
  visual_analyses = []
116
 
117
  for idx, image in enumerate(images):
 
121
  print(f"⚠️ Image {idx}: No path provided")
122
  continue
123
 
 
124
  visual_analysis = self.analyze_image_visually(image_path, idx)
125
 
126
  visual_analyses.append({
 
134
  return visual_analyses
135
 
136
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
137
+
 
 
138
  chunks = []
139
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
140
 
 
146
 
147
  try:
148
  prompt = f"""Summarize this text chunk in {self.language}.
149
+ Be brief and meaningful. Extract key points, facts, and main ideas.
150
 
151
  Text Chunk:
152
  {chunk}
153
 
154
+ Summary:"""
155
 
156
  message = HumanMessage(content=prompt)
157
  response = self.llm.invoke([message])
 
174
  return chunks
175
 
176
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
 
 
 
177
  summaries = []
178
 
179
  for idx, table in enumerate(tables):
 
184
 
185
  try:
186
  prompt = f"""Analyze and summarize this table/structured data in {self.language}.
187
+ Extract key insights, row/column meanings, and important figures. Be brief and meaningful.
188
 
189
  Table Content:
190
  {table_content}
191
 
192
+ Summary:"""
193
 
194
  message = HumanMessage(content=prompt)
195
  response = self.llm.invoke([message])
 
219
  vector_store,
220
  doc_id: str
221
  ) -> Dict:
222
+
 
 
 
223
  print(f"\n{'='*70}")
224
  print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
225
  print(f"{'='*70}")
 
232
  'total_stored': 0
233
  }
234
 
 
235
  print(f"\nπŸ–ΌοΈ VISUAL IMAGE ANALYSIS (gpt-4o vision) ({len(images)} total)")
236
  print(f"{'─'*70}")
237
 
238
  image_analyses = self.analyze_images_visually(images)
239
  results['image_visual_analyses'] = image_analyses
240
 
 
241
  image_docs = {
242
  'text': ' | '.join([
243
  f"Image {a['image_index']}: {a['visual_analysis']}"
 
263
  except Exception as e:
264
  print(f"❌ Error storing image analyses: {e}")
265
 
 
266
  print(f"\nπŸ“ TEXT CHUNK SUMMARIZATION")
267
  print(f"{'─'*70}")
268
 
 
290
  except Exception as e:
291
  print(f"❌ Error storing text summaries: {e}")
292
 
 
293
  print(f"\nπŸ“‹ TABLE SUMMARIZATION ({len(tables)} total)")
294
  print(f"{'─'*70}")
295
 
 
317
  except Exception as e:
318
  print(f"❌ Error storing table summaries: {e}")
319
 
 
320
  print(f"\n{'='*70}")
321
  print(f"πŸ“Š STORAGE SUMMARY")
322
  print(f"{'='*70}")
 
330
  return results
331
 
332
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
 
333
  chunks = []
334
  start = 0
335
  while start < len(text):
 
339
  return chunks
340
 
341
  def get_visual_summaries_log(self) -> List[Dict]:
 
342
  return self.visual_summaries_log
343
 
344
 
345
  class AnsweringRAG:
346
+
 
 
 
 
 
347
 
348
  def __init__(self, api_key: str = None, debug: bool = True):
349
  api_key = api_key or OPENAI_API_KEY
350
  self.debug = debug
351
 
352
  self.llm = ChatOpenAI(
353
+ model_name="gpt-4o-mini",
354
  api_key=api_key,
355
  temperature=TEMPERATURE,
356
  max_tokens=MAX_TOKENS,
 
360
  self.answer_log = []
361
 
362
  if self.debug:
363
+ print("βœ… AnsweringRAG initialized")
364
 
365
  def _debug_print(self, label: str, data: any):
 
366
  if self.debug:
367
  print(f"\nπŸ” DEBUG [{label}]:")
368
  if isinstance(data, (list, dict)):
 
376
  question: str,
377
  search_results: List[Dict]
378
  ) -> Dict:
379
+
 
 
 
 
 
 
 
 
 
 
 
380
 
381
  print(f"\n{'='*70}")
382
  print(f"ANALYZING QUESTION & GENERATING ANSWER")
 
385
  print(f"\n❓ Question: {question}")
386
  print(f"πŸ“Š Search Results Found: {len(search_results)}")
387
 
 
388
  if not search_results:
389
  print(f"⚠️ No search results found!")
390
+ answer = f"""No relevant information in the document to answer question: "{question}"
391
+ """
 
 
 
 
392
 
393
  result = {
394
  'question': question,
 
400
  self.answer_log.append(result)
401
  return result
402
 
 
403
  context_parts = []
404
  for idx, result in enumerate(search_results, 1):
405
  content = result.get('content', '')
 
416
 
417
  self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
418
 
 
419
  analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
420
+
421
  USER QUESTION:
422
+ {question}
423
 
424
  RELEVANT CONTENT FROM DOCUMENT:
425
  {full_context}
 
431
  4. If the content doesn't fully answer the question, explain what information is available
432
  5. Be specific and cite the content when relevant
433
  6. Structure your answer clearly with key points
 
434
  ANSWER:"""
435
 
436
  print(f"\nπŸ” Analyzing search results...")
 
438
  print(f" Sources: {len(search_results)}")
439
 
440
  try:
 
441
  message = HumanMessage(content=analysis_prompt)
442
  response = self.llm.invoke([message])
443
  answer = response.content.strip()
444
 
 
445
  confidence = self._estimate_confidence(len(search_results), answer)
446
 
447
  print(f"βœ… Answer generated successfully")
 
476
  return result
477
 
478
  def _estimate_confidence(self, sources_count: int, answer: str) -> str:
 
479
  answer_length = len(answer)
480
 
 
481
  if sources_count >= 3 and answer_length > 500:
482
  return "high"
483
 
 
484
  elif sources_count >= 2 and answer_length > 200:
485
  return "medium"
486
 
 
487
  else:
488
  return "low"
489
 
 
492
  question: str,
493
  search_results: List[Dict]
494
  ) -> Dict:
495
+
 
 
 
 
496
  result = self.analyze_and_answer(question, search_results)
497
 
 
498
  formatted_sources = []
499
  for idx, source in enumerate(result['search_results'], 1):
500
  formatted_sources.append({
 
508
  return result
509
 
510
  def get_answer_log(self) -> List[Dict]:
 
511
  return self.answer_log
512
 
513
  def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
 
514
 
515
  print(f"\n{'='*70}")
516
  print(f"ANSWER TO: {result['question']}")