Spaces:

kamkol
/

AB_Testing_RAG

Sleeping

App Files Community

kamkol committed on Apr 11, 2025

Commit

a2b3704

1 Parent(s): db2c124

Fix metadata association to display correct page numbers in sources attempt 2

Browse files

Files changed (1) hide show

app.py +103 -18

app.py CHANGED Viewed

@@ -27,6 +27,10 @@ Question:
 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
 class RetrievalAugmentedQAPipeline:
     def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase, metadata: List[Dict[str, Any]] = None, texts: List[str] = None) -> None:
         self.llm = llm
@@ -34,17 +38,51 @@ class RetrievalAugmentedQAPipeline:
         self.metadata = metadata or []
         self.text_to_metadata = {}
-        # Ensure we have the original texts that match the metadata
-        if metadata and texts and len(texts) == len(metadata):
-            # Create a direct mapping from text to its metadata using the original texts
-            for i, text in enumerate(texts):
-                self.text_to_metadata[text] = metadata[i]
             print(f"Successfully mapped {len(self.text_to_metadata)} text chunks to metadata")
         else:
-            print(f"Warning: Metadata mapping not created. Metadata: {len(metadata) if metadata else 0}, Texts: {len(texts) if texts else 0}")
     async def arun_pipeline(self, user_query: str):
         context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
         context_prompt = ""
         sources = []
@@ -53,21 +91,44 @@ class RetrievalAugmentedQAPipeline:
             text = context[0]
             context_prompt += text + "\n"
-            # Get metadata for this text if available
-            if text in self.text_to_metadata:
-                sources.append(self.text_to_metadata[text])
             else:
                 # If exact text not found, try finding most similar text
-                # This is a fallback mechanism
                 found = False
                 for orig_text, meta in self.text_to_metadata.items():
-                    # Simple overlap check - if 80% of the text matches
-                    if len(set(text.split()).intersection(set(orig_text.split()))) / max(len(set(text.split())), 1) > 0.8:
-                        sources.append(meta)
-                        found = True
-                        break
                 if not found:
                     sources.append({"filename": "unknown", "page": "unknown"})
         formatted_system_prompt = system_role_prompt.create_message()
@@ -91,17 +152,41 @@ def load_preprocessed_data():
     with open('data/preprocessed_data.pkl', 'rb') as f:
         data = pickle.load(f)
     # Create a new vector database
     vector_db = VectorDatabase()
-    # Directly populate the vectors dictionary
-    for key, vector in data['vectors'].items():
-        vector_db.insert(key, vector)
     # Get metadata and original texts if available
     metadata = data.get('metadata', [])
     texts = data.get('texts', [])
     return vector_db, metadata, texts
 @cl.on_chat_start

 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
+def normalize_text(text):
+    """Normalize text for better matching by removing extra whitespace and converting to lowercase"""
+    return ' '.join(text.lower().split())
 class RetrievalAugmentedQAPipeline:
     def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase, metadata: List[Dict[str, Any]] = None, texts: List[str] = None) -> None:
         self.llm = llm
         self.metadata = metadata or []
         self.text_to_metadata = {}
+        # Debug info about input data
+        print(f"Init with metadata length: {len(metadata) if metadata else 0}, texts length: {len(texts) if texts else 0}")
+        # Enhanced text-to-metadata mapping with normalization
+        if metadata and texts and len(metadata) > 0:
+            # Create normalized versions of texts for better matching
+            normalized_texts = [normalize_text(t) for t in texts]
+            # First, try exact mapping if lengths match
+            if len(texts) == len(metadata):
+                print(f"Creating direct mapping with {len(texts)} texts")
+                for i, text in enumerate(texts):
+                    self.text_to_metadata[normalize_text(text)] = metadata[i]
+            # Otherwise map by tracking which PDF and page each chunk is from
+            else:
+                print(f"WARN: Length mismatch between texts ({len(texts)}) and metadata ({len(metadata)})")
+                current_file = None
+                current_page = None
+                for i, meta in enumerate(metadata):
+                    if i < len(normalized_texts):
+                        self.text_to_metadata[normalized_texts[i]] = meta
+                        # Track current file and page for debugging
+                        if current_file != meta['filename'] or current_page != meta['page']:
+                            current_file = meta['filename']
+                            current_page = meta['page']
+                            print(f"File: {current_file}, Page: {current_page}")
             print(f"Successfully mapped {len(self.text_to_metadata)} text chunks to metadata")
+            # Sample a few mappings for verification
+            sample_size = min(3, len(self.text_to_metadata))
+            sample_items = list(self.text_to_metadata.items())[:sample_size]
+            for i, (text, meta) in enumerate(sample_items):
+                print(f"Sample {i+1}: {text[:50]}... -> {meta}")
         else:
+            print(f"WARNING: Metadata mapping not created. Metadata: {len(metadata) if metadata else 0}, Texts: {len(texts) if texts else 0}")
     async def arun_pipeline(self, user_query: str):
         context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
+        # Debug: print the first retrieved context
+        if context_list:
+            print(f"Retrieved context: {context_list[0][0][:100]}...")
         context_prompt = ""
         sources = []
             text = context[0]
             context_prompt += text + "\n"
+            # Normalize the text for better matching
+            normalized_text = normalize_text(text)
+            # Get metadata for this text if available using normalized text
+            if normalized_text in self.text_to_metadata:
+                sources.append(self.text_to_metadata[normalized_text])
+                print(f"✓ Found exact metadata match for: {normalized_text[:50]}...")
             else:
                 # If exact text not found, try finding most similar text
+                print(f"× No exact match for: {normalized_text[:50]}...")
                 found = False
+                best_match = None
+                best_score = 0
+                # Try fuzzy matching
                 for orig_text, meta in self.text_to_metadata.items():
+                    # Calculate overlap score
+                    text_words = set(normalized_text.split())
+                    orig_words = set(orig_text.split())
+                    if not text_words or not orig_words:
+                        continue
+                    overlap = len(text_words.intersection(orig_words))
+                    score = overlap / max(len(text_words), len(orig_words))
+                    if score > best_score and score > 0.5:  # Minimum 50% word overlap
+                        best_score = score
+                        best_match = meta
+                if best_match:
+                    sources.append(best_match)
+                    print(f"✓ Found fuzzy match with score {best_score:.2f}")
+                    found = True
                 if not found:
+                    print("× No match found at all")
                     sources.append({"filename": "unknown", "page": "unknown"})
         formatted_system_prompt = system_role_prompt.create_message()
     with open('data/preprocessed_data.pkl', 'rb') as f:
         data = pickle.load(f)
+    # Debug info about the file contents
+    print(f"Loaded preprocessed data with keys: {list(data.keys())}")
     # Create a new vector database
     vector_db = VectorDatabase()
+    # Check that vectors dictionary has data
+    if 'vectors' in data and data['vectors']:
+        print(f"Vectors dictionary has {len(data['vectors'])} entries")
+        # Directly populate the vectors dictionary
+        for key, vector in data['vectors'].items():
+            vector_db.insert(key, vector)
+    else:
+        print("WARNING: No vectors found in preprocessed data")
     # Get metadata and original texts if available
     metadata = data.get('metadata', [])
     texts = data.get('texts', [])
+    print(f"Loaded {len(metadata)} metadata entries and {len(texts)} texts")
+    # Verify a sample of metadata to debug page numbering
+    if metadata and len(metadata) > 0:
+        page_counts = {}
+        for meta in metadata:
+            filename = meta.get('filename', 'unknown')
+            page = meta.get('page', 'unknown')
+            if filename not in page_counts:
+                page_counts[filename] = set()
+            page_counts[filename].add(page)
+        print(f"Found {len(page_counts)} unique files with pages:")
+        for filename, pages in page_counts.items():
+            print(f"  - {filename}: {len(pages)} unique pages (min: {min(pages)}, max: {max(pages)})")
     return vector_db, metadata, texts
 @cl.on_chat_start