dnj0 committed on
Commit
5f78fd3
·
verified ·
1 Parent(s): aaa4e39

Update src/rag_system.py

Browse files
Files changed (1) hide show
  1. src/rag_system.py +147 -90
src/rag_system.py CHANGED
@@ -1,42 +1,46 @@
1
  """
2
- Enhanced RAG System - Individual Summarization + Vector Store Persistence
3
- Summarizes each image, text chunk, and table separately, then stores results
 
4
  """
5
  from typing import List, Dict
6
  from langchain_openai import ChatOpenAI
7
  from langchain_core.messages import HumanMessage, SystemMessage
8
- import hashlib
 
 
9
  from config import (
10
  OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
11
- LANGUAGE, CACHE_RESPONSES, BATCH_SEARCH_RESULTS
12
  )
13
 
14
 
15
  class MultimodalRAG:
16
  """
17
  RAG system that:
18
- 1. Summarizes each component individually
19
- 2. Stores summaries in vector store
20
- 3. Enables fine-grained semantic search
 
21
  """
22
 
23
  def __init__(self, api_key: str = None, debug: bool = True):
24
  api_key = api_key or OPENAI_API_KEY
25
  self.debug = debug
26
 
 
27
  self.llm = ChatOpenAI(
28
- model_name=OPENAI_MODEL,
29
  api_key=api_key,
30
  temperature=TEMPERATURE,
31
  max_tokens=MAX_TOKENS,
32
  )
33
 
34
- self.conversation_history = []
35
  self.language = LANGUAGE
36
- self.summaries_log = []
37
 
38
  if self.debug:
39
- print("✅ EnhancedMultimodalRAG initialized")
40
 
41
  def _debug_print(self, label: str, data: any):
42
  """Print debug information"""
@@ -48,54 +52,134 @@ class MultimodalRAG:
48
  else:
49
  print(f" {data}")
50
 
51
- def summarize_image(self, image_ocr_text: str, image_idx: int) -> str:
 
 
 
 
 
 
 
 
 
 
52
  """
53
- Summarize a single image's OCR text
54
- Returns concise summary focused on image content
 
 
 
 
 
 
 
 
55
  """
56
- if not image_ocr_text or len(image_ocr_text.strip()) < 5:
57
- return f"[Image {image_idx}: No readable text or empty content]"
58
 
59
  try:
60
- prompt = f"""Summarize this text extracted from an image in {self.language}.
61
- Keep it concise but informative. Focus on key information, data, and visual elements.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
- Image OCR Text:
64
- {image_ocr_text}
 
 
 
 
65
 
66
- Summary (2-3 sentences maximum):"""
 
 
 
 
 
67
 
68
- message = HumanMessage(content=prompt)
69
  response = self.llm.invoke([message])
70
- summary = response.content.strip()
71
 
72
  if self.debug:
73
- self._debug_print(f"Image {image_idx} Summary", summary)
 
 
 
74
 
75
- return summary
76
  except Exception as e:
77
- error_msg = f"[Image {image_idx}: Summarization failed - {str(e)}]"
78
- print(f"Error summarizing image {image_idx}: {e}")
79
  return error_msg
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
82
  """
83
  Chunk text and summarize each chunk individually
84
- Returns list of {chunk_text, summary, type, index}
85
  """
86
  chunks = []
87
-
88
- # Split text into chunks
89
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
90
 
91
  self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")
92
 
93
  for idx, chunk in enumerate(text_chunks):
94
- if len(chunk.strip()) < 50: # Skip very small chunks
95
  continue
96
 
97
  try:
98
- # Summarize chunk
99
  prompt = f"""Summarize this text chunk in {self.language}.
100
  Keep it concise. Extract key points, facts, and main ideas.
101
 
@@ -111,7 +195,7 @@ Summary (2-3 sentences maximum):"""
111
  chunks.append({
112
  'type': 'text_chunk',
113
  'chunk_index': len(chunks),
114
- 'original_text': chunk[:500], # Store first 500 chars
115
  'summary': summary,
116
  'chunk_length': len(chunk)
117
  })
@@ -127,7 +211,6 @@ Summary (2-3 sentences maximum):"""
127
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
128
  """
129
  Summarize each table individually
130
- Returns list of {table_content, summary, type, index}
131
  """
132
  summaries = []
133
 
@@ -138,7 +221,6 @@ Summary (2-3 sentences maximum):"""
138
  continue
139
 
140
  try:
141
- # Summarize table
142
  prompt = f"""Analyze and summarize this table/structured data in {self.language}.
143
  Extract key insights, row/column meanings, and important figures.
144
 
@@ -167,27 +249,6 @@ Summary (2-3 sentences maximum):"""
167
 
168
  return summaries
169
 
170
- def summarize_images(self, images: List[Dict]) -> List[Dict]:
171
- """
172
- Summarize each image individually
173
- Returns list of {image_index, ocr_text, summary, type}
174
- """
175
- summaries = []
176
-
177
- for idx, image in enumerate(images):
178
- ocr_text = image.get('ocr_text', '')
179
- summary = self.summarize_image(ocr_text, idx)
180
-
181
- summaries.append({
182
- 'type': 'image',
183
- 'image_index': idx,
184
- 'original_ocr': ocr_text[:500],
185
- 'summary': summary,
186
- 'ocr_length': len(ocr_text)
187
- })
188
-
189
- return summaries
190
-
191
  def process_and_store_document(
192
  self,
193
  text: str,
@@ -197,58 +258,61 @@ Summary (2-3 sentences maximum):"""
197
  doc_id: str
198
  ) -> Dict:
199
  """
200
- Main function: Summarize all components and store in vector store
201
- Returns summary statistics
202
  """
203
  print(f"\n{'='*70}")
204
- print(f"PROCESSING AND STORING: {doc_id}")
205
  print(f"{'='*70}")
206
 
207
  results = {
208
  'doc_id': doc_id,
209
- 'image_summaries': [],
210
  'text_summaries': [],
211
  'table_summaries': [],
212
  'total_stored': 0
213
  }
214
 
215
- # 1. Summarize and store images
216
- print(f"\n🖼️ PROCESSING IMAGES ({len(images)} total)")
217
  print(f"{'─'*70}")
218
 
219
- image_summaries = self.summarize_images(images)
220
- results['image_summaries'] = image_summaries
221
 
222
- # Store each image summary in vector store
223
  image_docs = {
224
- 'text': ' | '.join([f"Image {s['image_index']}: {s['summary']}"
225
- for s in image_summaries]),
 
 
226
  'images': [],
227
  'tables': []
228
  }
229
 
230
- for summary in image_summaries:
231
- print(f" ✅ Image {summary['image_index']}: {summary['summary'][:50]}...")
 
 
232
 
233
- if image_summaries:
234
  try:
235
  vector_store.add_documents(
236
  image_docs,
237
- f"{doc_id}_images"
238
  )
239
- results['total_stored'] += len(image_summaries)
240
- print(f"✅ Stored {len(image_summaries)} image summaries")
241
  except Exception as e:
242
- print(f"❌ Error storing image summaries: {e}")
243
 
244
  # 2. Summarize and store text chunks
245
- print(f"\n📝 PROCESSING TEXT CHUNKS")
246
  print(f"{'─'*70}")
247
 
248
  text_summaries = self.summarize_text_chunks(text)
249
  results['text_summaries'] = text_summaries
250
 
251
- # Store each text chunk summary in vector store
252
  text_docs = {
253
  'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
254
  for s in text_summaries]),
@@ -271,13 +335,12 @@ Summary (2-3 sentences maximum):"""
271
  print(f"❌ Error storing text summaries: {e}")
272
 
273
  # 3. Summarize and store tables
274
- print(f"\n📋 PROCESSING TABLES ({len(tables)} total)")
275
  print(f"{'─'*70}")
276
 
277
  table_summaries = self.summarize_tables(tables)
278
  results['table_summaries'] = table_summaries
279
 
280
- # Store each table summary in vector store
281
  table_docs = {
282
  'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
283
  for s in table_summaries]),
@@ -303,13 +366,13 @@ Summary (2-3 sentences maximum):"""
303
  print(f"\n{'='*70}")
304
  print(f"📊 STORAGE SUMMARY")
305
  print(f"{'='*70}")
306
- print(f" Images summarized & stored: {len(image_summaries)}")
307
  print(f" Text chunks summarized & stored: {len(text_summaries)}")
308
  print(f" Tables summarized & stored: {len(table_summaries)}")
309
- print(f" Total items stored: {results['total_stored']}")
310
  print(f"{'='*70}")
311
 
312
- self.summaries_log.append(results)
313
  return results
314
 
315
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
@@ -322,12 +385,6 @@ Summary (2-3 sentences maximum):"""
322
  start = end - overlap
323
  return chunks
324
 
325
- def get_summaries_log(self) -> List[Dict]:
326
- """Get all processing logs"""
327
- return self.summaries_log
328
-
329
- def clear_history(self):
330
- """Clear conversation history"""
331
- self.conversation_history = []
332
- if self.debug:
333
- print("✅ Conversation history cleared")
 
1
  """
2
+ Enhanced RAG System - Visual Image Analysis
3
+ Sends base64 images directly to GPT-4o for visual analysis (not just OCR)
4
+ Then stores results in vector store
5
  """
6
  from typing import List, Dict
7
  from langchain_openai import ChatOpenAI
8
  from langchain_core.messages import HumanMessage, SystemMessage
9
+ import base64
10
+ import os
11
+ from pathlib import Path
12
  from config import (
13
  OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS,
14
+ LANGUAGE, CHROMA_DB_PATH
15
  )
16
 
17
 
18
  class MultimodalRAG:
19
  """
20
  RAG system that:
21
+ 1. Sends images as base64 to GPT-4o for visual analysis
22
+ 2. Gets detailed visual descriptions and insights
23
+ 3. Stores visual analysis in vector store
24
+ 4. Enables image-based semantic search
25
  """
26
 
27
  def __init__(self, api_key: str = None, debug: bool = True):
28
  api_key = api_key or OPENAI_API_KEY
29
  self.debug = debug
30
 
31
+ # Use gpt-4o for vision capabilities
32
  self.llm = ChatOpenAI(
33
+ model_name="gpt-4o", # CRITICAL: gpt-4o has vision
34
  api_key=api_key,
35
  temperature=TEMPERATURE,
36
  max_tokens=MAX_TOKENS,
37
  )
38
 
 
39
  self.language = LANGUAGE
40
+ self.visual_summaries_log = []
41
 
42
  if self.debug:
43
+ print("✅ VisualMultimodalRAG initialized with gpt-4o (vision model)")
44
 
45
  def _debug_print(self, label: str, data: any):
46
  """Print debug information"""
 
52
  else:
53
  print(f" {data}")
54
 
55
+ def _image_to_base64(self, image_path: str) -> str:
56
+ """Convert image file to base64 string"""
57
+ try:
58
+ with open(image_path, 'rb') as image_file:
59
+ image_data = base64.b64encode(image_file.read()).decode('utf-8')
60
+ return image_data
61
+ except Exception as e:
62
+ print(f"Error converting image to base64: {e}")
63
+ return None
64
+
65
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
    """
    Run a vision analysis of one image file through the chat model.

    The file is base64-encoded and sent inline as a data URL, together with
    a text prompt asking for a detailed analysis written in self.language.

    Args:
        image_path: Path of the image file on disk.
        image_idx: Index of the image; used only in log and error messages.

    Returns:
        The model's reply with surrounding whitespace stripped. This method
        does not raise on failure; it returns a bracketed placeholder string
        when the file is missing, when base64 conversion yields nothing, or
        when any exception occurs while building or sending the request.
    """
    if not os.path.exists(image_path):
        return f"[Image {image_idx}: File not found - {image_path}]"

    try:
        encoded_image = self._image_to_base64(image_path)
        if not encoded_image:
            return f"[Image {image_idx}: Could not convert to base64]"

        # Pick the MIME type from the file extension; anything unrecognized
        # is labelled as PNG.
        mime_by_suffix = {
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.png': 'image/png',
            '.gif': 'image/gif',
            '.webp': 'image/webp'
        }
        suffix = Path(image_path).suffix.lower()
        media_type = mime_by_suffix.get(suffix, 'image/png')

        print(f"🔍 Analyzing image {image_idx} visually (as {media_type})...")

        # The request has two content parts: the inline image, then the prompt.
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": f"data:{media_type};base64,{encoded_image}",
            },
        }
        text_part = {
            "type": "text",
            "text": f"""Analyze this image in detail in {self.language}.

Provide a comprehensive visual analysis including:
1. **What you see** - Main objects, elements, structure
2. **Data/Content** - Any numbers, text, charts, graphs
3. **Purpose** - What this image is showing or representing
4. **Key insights** - Important patterns, trends, or information
5. **Connections** - How this relates to document content

Be specific and detailed. Focus on visual information that cannot be extracted from text alone.

Analysis:"""
        }
        message = HumanMessage(content=[image_part, text_part])

        reply = self.llm.invoke([message])
        analysis = reply.content.strip()

        if self.debug:
            self._debug_print(f"Image {image_idx} Visual Analysis", analysis)

        print(f"✅ Image {image_idx} analyzed successfully")
        return analysis

    except Exception as e:
        # Failures are reported as text so that one bad image does not
        # abort processing of the remaining ones.
        error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
        print(f"Error analyzing image {image_idx}: {e}")
        return error_msg
141
 
142
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
    """
    Run the vision analysis over every image entry that has a path.

    Args:
        images: Image dicts. Each is read for a 'path' key and, optionally,
            an 'ocr_text' key.

    Returns:
        One dict per analyzed image with the keys 'type' (always
        'image_visual'), 'image_index' (position in the input list),
        'image_path', 'visual_analysis' and 'ocr_text'. Entries whose
        'path' is missing or empty are reported with a warning and skipped,
        so the result may be shorter than the input.
    """
    results = []

    for position, entry in enumerate(images):
        path = entry.get('path', '')

        if path:
            description = self.analyze_image_visually(path, position)
            record = {
                'type': 'image_visual',
                'image_index': position,
                'image_path': path,
                'visual_analysis': description,
                # OCR text is carried along as a fallback next to the analysis.
                'ocr_text': entry.get('ocr_text', ''),
            }
            results.append(record)
        else:
            print(f"⚠️ Image {position}: No path provided")

    return results
168
+
169
  def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
170
  """
171
  Chunk text and summarize each chunk individually
 
172
  """
173
  chunks = []
 
 
174
  text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
175
 
176
  self._debug_print("Text Chunking", f"Created {len(text_chunks)} chunks")
177
 
178
  for idx, chunk in enumerate(text_chunks):
179
+ if len(chunk.strip()) < 50:
180
  continue
181
 
182
  try:
 
183
  prompt = f"""Summarize this text chunk in {self.language}.
184
  Keep it concise. Extract key points, facts, and main ideas.
185
 
 
195
  chunks.append({
196
  'type': 'text_chunk',
197
  'chunk_index': len(chunks),
198
+ 'original_text': chunk[:500],
199
  'summary': summary,
200
  'chunk_length': len(chunk)
201
  })
 
211
  def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
212
  """
213
  Summarize each table individually
 
214
  """
215
  summaries = []
216
 
 
221
  continue
222
 
223
  try:
 
224
  prompt = f"""Analyze and summarize this table/structured data in {self.language}.
225
  Extract key insights, row/column meanings, and important figures.
226
 
 
249
 
250
  return summaries
251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  def process_and_store_document(
253
  self,
254
  text: str,
 
258
  doc_id: str
259
  ) -> Dict:
260
  """
261
+ Main function: Analyze all components visually and store in vector store
262
+ Images are analyzed using gpt-4o vision (not just OCR)
263
  """
264
  print(f"\n{'='*70}")
265
+ print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
266
  print(f"{'='*70}")
267
 
268
  results = {
269
  'doc_id': doc_id,
270
+ 'image_visual_analyses': [],
271
  'text_summaries': [],
272
  'table_summaries': [],
273
  'total_stored': 0
274
  }
275
 
276
+ # 1. Analyze images VISUALLY using gpt-4o
277
+ print(f"\n🖼️ VISUAL IMAGE ANALYSIS (gpt-4o vision) ({len(images)} total)")
278
  print(f"{'─'*70}")
279
 
280
+ image_analyses = self.analyze_images_visually(images)
281
+ results['image_visual_analyses'] = image_analyses
282
 
283
+ # Store each image analysis in vector store
284
  image_docs = {
285
+ 'text': ' | '.join([
286
+ f"Image {a['image_index']}: {a['visual_analysis']}"
287
+ for a in image_analyses
288
+ ]),
289
  'images': [],
290
  'tables': []
291
  }
292
 
293
+ for analysis in image_analyses:
294
+ print(f" ✅ Image {analysis['image_index']} (visual analysis)")
295
+ print(f" Path: {analysis['image_path']}")
296
+ print(f" Analysis: {analysis['visual_analysis'][:100]}...")
297
 
298
+ if image_analyses:
299
  try:
300
  vector_store.add_documents(
301
  image_docs,
302
+ f"{doc_id}_images_visual"
303
  )
304
+ results['total_stored'] += len(image_analyses)
305
+ print(f"✅ Stored {len(image_analyses)} image visual analyses")
306
  except Exception as e:
307
+ print(f"❌ Error storing image analyses: {e}")
308
 
309
  # 2. Summarize and store text chunks
310
+ print(f"\n📝 TEXT CHUNK SUMMARIZATION")
311
  print(f"{'─'*70}")
312
 
313
  text_summaries = self.summarize_text_chunks(text)
314
  results['text_summaries'] = text_summaries
315
 
 
316
  text_docs = {
317
  'text': ' | '.join([f"Chunk {s['chunk_index']}: {s['summary']}"
318
  for s in text_summaries]),
 
335
  print(f"❌ Error storing text summaries: {e}")
336
 
337
  # 3. Summarize and store tables
338
+ print(f"\n📋 TABLE SUMMARIZATION ({len(tables)} total)")
339
  print(f"{'─'*70}")
340
 
341
  table_summaries = self.summarize_tables(tables)
342
  results['table_summaries'] = table_summaries
343
 
 
344
  table_docs = {
345
  'text': ' | '.join([f"Table {s['table_index']}: {s['summary']}"
346
  for s in table_summaries]),
 
366
  print(f"\n{'='*70}")
367
  print(f"📊 STORAGE SUMMARY")
368
  print(f"{'='*70}")
369
+ print(f" Images analyzed visually & stored: {len(image_analyses)}")
370
  print(f" Text chunks summarized & stored: {len(text_summaries)}")
371
  print(f" Tables summarized & stored: {len(table_summaries)}")
372
+ print(f" Total items stored in vector: {results['total_stored']}")
373
  print(f"{'='*70}")
374
 
375
+ self.visual_summaries_log.append(results)
376
  return results
377
 
378
  def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
 
385
  start = end - overlap
386
  return chunks
387
 
388
def get_visual_summaries_log(self) -> List[Dict]:
    """
    Return the accumulated visual analysis log.

    The list object held by the instance is returned directly (not a copy),
    so changes made by the caller are visible to the instance.
    """
    history = self.visual_summaries_log
    return history