kamkol committed on
Commit
a2b3704
·
1 Parent(s): db2c124

Fix metadata association to display correct page numbers in sources (attempt 2)

Browse files
Files changed (1) hide show
  1. app.py +103 -18
app.py CHANGED
@@ -27,6 +27,10 @@ Question:
27
  """
28
  user_role_prompt = UserRolePrompt(user_prompt_template)
29
 
 
 
 
 
30
  class RetrievalAugmentedQAPipeline:
31
  def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase, metadata: List[Dict[str, Any]] = None, texts: List[str] = None) -> None:
32
  self.llm = llm
@@ -34,17 +38,51 @@ class RetrievalAugmentedQAPipeline:
34
  self.metadata = metadata or []
35
  self.text_to_metadata = {}
36
 
37
- # Ensure we have the original texts that match the metadata
38
- if metadata and texts and len(texts) == len(metadata):
39
- # Create a direct mapping from text to its metadata using the original texts
40
- for i, text in enumerate(texts):
41
- self.text_to_metadata[text] = metadata[i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  print(f"Successfully mapped {len(self.text_to_metadata)} text chunks to metadata")
 
 
 
 
 
 
43
  else:
44
- print(f"Warning: Metadata mapping not created. Metadata: {len(metadata) if metadata else 0}, Texts: {len(texts) if texts else 0}")
45
 
46
  async def arun_pipeline(self, user_query: str):
47
  context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
 
 
 
48
 
49
  context_prompt = ""
50
  sources = []
@@ -53,21 +91,44 @@ class RetrievalAugmentedQAPipeline:
53
  text = context[0]
54
  context_prompt += text + "\n"
55
 
56
- # Get metadata for this text if available
57
- if text in self.text_to_metadata:
58
- sources.append(self.text_to_metadata[text])
 
 
 
 
59
  else:
60
  # If exact text not found, try finding most similar text
61
- # This is a fallback mechanism
 
62
  found = False
 
 
 
 
63
  for orig_text, meta in self.text_to_metadata.items():
64
- # Simple overlap check - if 80% of the text matches
65
- if len(set(text.split()).intersection(set(orig_text.split()))) / max(len(set(text.split())), 1) > 0.8:
66
- sources.append(meta)
67
- found = True
68
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  if not found:
 
71
  sources.append({"filename": "unknown", "page": "unknown"})
72
 
73
  formatted_system_prompt = system_role_prompt.create_message()
@@ -91,17 +152,41 @@ def load_preprocessed_data():
91
  with open('data/preprocessed_data.pkl', 'rb') as f:
92
  data = pickle.load(f)
93
 
 
 
 
94
  # Create a new vector database
95
  vector_db = VectorDatabase()
96
 
97
- # Directly populate the vectors dictionary
98
- for key, vector in data['vectors'].items():
99
- vector_db.insert(key, vector)
 
 
 
 
 
100
 
101
  # Get metadata and original texts if available
102
  metadata = data.get('metadata', [])
103
  texts = data.get('texts', [])
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  return vector_db, metadata, texts
106
 
107
  @cl.on_chat_start
 
27
  """
28
  user_role_prompt = UserRolePrompt(user_prompt_template)
29
 
30
+ def normalize_text(text):
31
+ """Normalize text for better matching by removing extra whitespace and converting to lowercase"""
32
+ return ' '.join(text.lower().split())
33
+
34
  class RetrievalAugmentedQAPipeline:
35
  def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase, metadata: List[Dict[str, Any]] = None, texts: List[str] = None) -> None:
36
  self.llm = llm
 
38
  self.metadata = metadata or []
39
  self.text_to_metadata = {}
40
 
41
+ # Debug info about input data
42
+ print(f"Init with metadata length: {len(metadata) if metadata else 0}, texts length: {len(texts) if texts else 0}")
43
+
44
+ # Enhanced text-to-metadata mapping with normalization
45
+ if metadata and texts and len(metadata) > 0:
46
+ # Create normalized versions of texts for better matching
47
+ normalized_texts = [normalize_text(t) for t in texts]
48
+
49
+ # First, try exact mapping if lengths match
50
+ if len(texts) == len(metadata):
51
+ print(f"Creating direct mapping with {len(texts)} texts")
52
+ for i, text in enumerate(texts):
53
+ self.text_to_metadata[normalize_text(text)] = metadata[i]
54
+
55
+ # Otherwise map by tracking which PDF and page each chunk is from
56
+ else:
57
+ print(f"WARN: Length mismatch between texts ({len(texts)}) and metadata ({len(metadata)})")
58
+ current_file = None
59
+ current_page = None
60
+
61
+ for i, meta in enumerate(metadata):
62
+ if i < len(normalized_texts):
63
+ self.text_to_metadata[normalized_texts[i]] = meta
64
+
65
+ # Track current file and page for debugging
66
+ if current_file != meta['filename'] or current_page != meta['page']:
67
+ current_file = meta['filename']
68
+ current_page = meta['page']
69
+ print(f"File: {current_file}, Page: {current_page}")
70
+
71
  print(f"Successfully mapped {len(self.text_to_metadata)} text chunks to metadata")
72
+
73
+ # Sample a few mappings for verification
74
+ sample_size = min(3, len(self.text_to_metadata))
75
+ sample_items = list(self.text_to_metadata.items())[:sample_size]
76
+ for i, (text, meta) in enumerate(sample_items):
77
+ print(f"Sample {i+1}: {text[:50]}... -> {meta}")
78
  else:
79
+ print(f"WARNING: Metadata mapping not created. Metadata: {len(metadata) if metadata else 0}, Texts: {len(texts) if texts else 0}")
80
 
81
  async def arun_pipeline(self, user_query: str):
82
  context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
83
+ # Debug: print the first retrieved context
84
+ if context_list:
85
+ print(f"Retrieved context: {context_list[0][0][:100]}...")
86
 
87
  context_prompt = ""
88
  sources = []
 
91
  text = context[0]
92
  context_prompt += text + "\n"
93
 
94
+ # Normalize the text for better matching
95
+ normalized_text = normalize_text(text)
96
+
97
+ # Get metadata for this text if available using normalized text
98
+ if normalized_text in self.text_to_metadata:
99
+ sources.append(self.text_to_metadata[normalized_text])
100
+ print(f"✓ Found exact metadata match for: {normalized_text[:50]}...")
101
  else:
102
  # If exact text not found, try finding most similar text
103
+ print(f"× No exact match for: {normalized_text[:50]}...")
104
+
105
  found = False
106
+ best_match = None
107
+ best_score = 0
108
+
109
+ # Try fuzzy matching
110
  for orig_text, meta in self.text_to_metadata.items():
111
+ # Calculate overlap score
112
+ text_words = set(normalized_text.split())
113
+ orig_words = set(orig_text.split())
114
+
115
+ if not text_words or not orig_words:
116
+ continue
117
+
118
+ overlap = len(text_words.intersection(orig_words))
119
+ score = overlap / max(len(text_words), len(orig_words))
120
+
121
+ if score > best_score and score > 0.5: # Minimum 50% word overlap
122
+ best_score = score
123
+ best_match = meta
124
+
125
+ if best_match:
126
+ sources.append(best_match)
127
+ print(f"✓ Found fuzzy match with score {best_score:.2f}")
128
+ found = True
129
 
130
  if not found:
131
+ print("× No match found at all")
132
  sources.append({"filename": "unknown", "page": "unknown"})
133
 
134
  formatted_system_prompt = system_role_prompt.create_message()
 
152
  with open('data/preprocessed_data.pkl', 'rb') as f:
153
  data = pickle.load(f)
154
 
155
+ # Debug info about the file contents
156
+ print(f"Loaded preprocessed data with keys: {list(data.keys())}")
157
+
158
  # Create a new vector database
159
  vector_db = VectorDatabase()
160
 
161
+ # Check that vectors dictionary has data
162
+ if 'vectors' in data and data['vectors']:
163
+ print(f"Vectors dictionary has {len(data['vectors'])} entries")
164
+ # Directly populate the vectors dictionary
165
+ for key, vector in data['vectors'].items():
166
+ vector_db.insert(key, vector)
167
+ else:
168
+ print("WARNING: No vectors found in preprocessed data")
169
 
170
  # Get metadata and original texts if available
171
  metadata = data.get('metadata', [])
172
  texts = data.get('texts', [])
173
 
174
+ print(f"Loaded {len(metadata)} metadata entries and {len(texts)} texts")
175
+
176
+ # Verify a sample of metadata to debug page numbering
177
+ if metadata and len(metadata) > 0:
178
+ page_counts = {}
179
+ for meta in metadata:
180
+ filename = meta.get('filename', 'unknown')
181
+ page = meta.get('page', 'unknown')
182
+ if filename not in page_counts:
183
+ page_counts[filename] = set()
184
+ page_counts[filename].add(page)
185
+
186
+ print(f"Found {len(page_counts)} unique files with pages:")
187
+ for filename, pages in page_counts.items():
188
+ print(f" - {filename}: {len(pages)} unique pages (min: {min(pages)}, max: {max(pages)})")
189
+
190
  return vector_db, metadata, texts
191
 
192
  @cl.on_chat_start