Spaces:
Sleeping
Sleeping
Fix metadata association to display correct page numbers in sources (attempt 2)
Browse files
app.py
CHANGED
|
@@ -27,6 +27,10 @@ Question:
|
|
| 27 |
"""
|
| 28 |
user_role_prompt = UserRolePrompt(user_prompt_template)
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
class RetrievalAugmentedQAPipeline:
|
| 31 |
def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase, metadata: List[Dict[str, Any]] = None, texts: List[str] = None) -> None:
|
| 32 |
self.llm = llm
|
|
@@ -34,17 +38,51 @@ class RetrievalAugmentedQAPipeline:
|
|
| 34 |
self.metadata = metadata or []
|
| 35 |
self.text_to_metadata = {}
|
| 36 |
|
| 37 |
-
#
|
| 38 |
-
if metadata
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
print(f"Successfully mapped {len(self.text_to_metadata)} text chunks to metadata")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
else:
|
| 44 |
-
print(f"
|
| 45 |
|
| 46 |
async def arun_pipeline(self, user_query: str):
|
| 47 |
context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
context_prompt = ""
|
| 50 |
sources = []
|
|
@@ -53,21 +91,44 @@ class RetrievalAugmentedQAPipeline:
|
|
| 53 |
text = context[0]
|
| 54 |
context_prompt += text + "\n"
|
| 55 |
|
| 56 |
-
#
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
else:
|
| 60 |
# If exact text not found, try finding most similar text
|
| 61 |
-
|
|
|
|
| 62 |
found = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
for orig_text, meta in self.text_to_metadata.items():
|
| 64 |
-
#
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
if not found:
|
|
|
|
| 71 |
sources.append({"filename": "unknown", "page": "unknown"})
|
| 72 |
|
| 73 |
formatted_system_prompt = system_role_prompt.create_message()
|
|
@@ -91,17 +152,41 @@ def load_preprocessed_data():
|
|
| 91 |
with open('data/preprocessed_data.pkl', 'rb') as f:
|
| 92 |
data = pickle.load(f)
|
| 93 |
|
|
|
|
|
|
|
|
|
|
| 94 |
# Create a new vector database
|
| 95 |
vector_db = VectorDatabase()
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
# Get metadata and original texts if available
|
| 102 |
metadata = data.get('metadata', [])
|
| 103 |
texts = data.get('texts', [])
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
return vector_db, metadata, texts
|
| 106 |
|
| 107 |
@cl.on_chat_start
|
|
|
|
| 27 |
"""
|
| 28 |
user_role_prompt = UserRolePrompt(user_prompt_template)
|
| 29 |
|
| 30 |
+
def normalize_text(text):
    """Normalize text so that equivalent chunks compare equal.

    Lowercases the input and collapses every run of whitespace (spaces,
    tabs, newlines) into a single space, trimming both ends.

    Args:
        text: The string to normalize. ``None`` is treated as an empty
            string instead of raising ``AttributeError``.

    Returns:
        The normalized string; ``""`` when the input is ``None``, empty,
        or contains only whitespace.
    """
    if text is None:
        # A missing chunk should normalize to an empty key rather than crash.
        return ""
    # split() with no arguments splits on any whitespace run and discards
    # leading/trailing whitespace, so re-joining with " " collapses it.
    return " ".join(text.lower().split())
|
| 33 |
+
|
| 34 |
class RetrievalAugmentedQAPipeline:
|
| 35 |
def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase, metadata: List[Dict[str, Any]] = None, texts: List[str] = None) -> None:
|
| 36 |
self.llm = llm
|
|
|
|
| 38 |
self.metadata = metadata or []
|
| 39 |
self.text_to_metadata = {}
|
| 40 |
|
| 41 |
+
# Debug info about input data
|
| 42 |
+
print(f"Init with metadata length: {len(metadata) if metadata else 0}, texts length: {len(texts) if texts else 0}")
|
| 43 |
+
|
| 44 |
+
# Enhanced text-to-metadata mapping with normalization
|
| 45 |
+
if metadata and texts and len(metadata) > 0:
|
| 46 |
+
# Create normalized versions of texts for better matching
|
| 47 |
+
normalized_texts = [normalize_text(t) for t in texts]
|
| 48 |
+
|
| 49 |
+
# First, try exact mapping if lengths match
|
| 50 |
+
if len(texts) == len(metadata):
|
| 51 |
+
print(f"Creating direct mapping with {len(texts)} texts")
|
| 52 |
+
for i, text in enumerate(texts):
|
| 53 |
+
self.text_to_metadata[normalize_text(text)] = metadata[i]
|
| 54 |
+
|
| 55 |
+
# Otherwise map by tracking which PDF and page each chunk is from
|
| 56 |
+
else:
|
| 57 |
+
print(f"WARN: Length mismatch between texts ({len(texts)}) and metadata ({len(metadata)})")
|
| 58 |
+
current_file = None
|
| 59 |
+
current_page = None
|
| 60 |
+
|
| 61 |
+
for i, meta in enumerate(metadata):
|
| 62 |
+
if i < len(normalized_texts):
|
| 63 |
+
self.text_to_metadata[normalized_texts[i]] = meta
|
| 64 |
+
|
| 65 |
+
# Track current file and page for debugging
|
| 66 |
+
if current_file != meta['filename'] or current_page != meta['page']:
|
| 67 |
+
current_file = meta['filename']
|
| 68 |
+
current_page = meta['page']
|
| 69 |
+
print(f"File: {current_file}, Page: {current_page}")
|
| 70 |
+
|
| 71 |
print(f"Successfully mapped {len(self.text_to_metadata)} text chunks to metadata")
|
| 72 |
+
|
| 73 |
+
# Sample a few mappings for verification
|
| 74 |
+
sample_size = min(3, len(self.text_to_metadata))
|
| 75 |
+
sample_items = list(self.text_to_metadata.items())[:sample_size]
|
| 76 |
+
for i, (text, meta) in enumerate(sample_items):
|
| 77 |
+
print(f"Sample {i+1}: {text[:50]}... -> {meta}")
|
| 78 |
else:
|
| 79 |
+
print(f"WARNING: Metadata mapping not created. Metadata: {len(metadata) if metadata else 0}, Texts: {len(texts) if texts else 0}")
|
| 80 |
|
| 81 |
async def arun_pipeline(self, user_query: str):
|
| 82 |
context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
|
| 83 |
+
# Debug: print the first retrieved context
|
| 84 |
+
if context_list:
|
| 85 |
+
print(f"Retrieved context: {context_list[0][0][:100]}...")
|
| 86 |
|
| 87 |
context_prompt = ""
|
| 88 |
sources = []
|
|
|
|
| 91 |
text = context[0]
|
| 92 |
context_prompt += text + "\n"
|
| 93 |
|
| 94 |
+
# Normalize the text for better matching
|
| 95 |
+
normalized_text = normalize_text(text)
|
| 96 |
+
|
| 97 |
+
# Get metadata for this text if available using normalized text
|
| 98 |
+
if normalized_text in self.text_to_metadata:
|
| 99 |
+
sources.append(self.text_to_metadata[normalized_text])
|
| 100 |
+
print(f"✓ Found exact metadata match for: {normalized_text[:50]}...")
|
| 101 |
else:
|
| 102 |
# If exact text not found, try finding most similar text
|
| 103 |
+
print(f"× No exact match for: {normalized_text[:50]}...")
|
| 104 |
+
|
| 105 |
found = False
|
| 106 |
+
best_match = None
|
| 107 |
+
best_score = 0
|
| 108 |
+
|
| 109 |
+
# Try fuzzy matching
|
| 110 |
for orig_text, meta in self.text_to_metadata.items():
|
| 111 |
+
# Calculate overlap score
|
| 112 |
+
text_words = set(normalized_text.split())
|
| 113 |
+
orig_words = set(orig_text.split())
|
| 114 |
+
|
| 115 |
+
if not text_words or not orig_words:
|
| 116 |
+
continue
|
| 117 |
+
|
| 118 |
+
overlap = len(text_words.intersection(orig_words))
|
| 119 |
+
score = overlap / max(len(text_words), len(orig_words))
|
| 120 |
+
|
| 121 |
+
if score > best_score and score > 0.5: # Minimum 50% word overlap
|
| 122 |
+
best_score = score
|
| 123 |
+
best_match = meta
|
| 124 |
+
|
| 125 |
+
if best_match:
|
| 126 |
+
sources.append(best_match)
|
| 127 |
+
print(f"✓ Found fuzzy match with score {best_score:.2f}")
|
| 128 |
+
found = True
|
| 129 |
|
| 130 |
if not found:
|
| 131 |
+
print("× No match found at all")
|
| 132 |
sources.append({"filename": "unknown", "page": "unknown"})
|
| 133 |
|
| 134 |
formatted_system_prompt = system_role_prompt.create_message()
|
|
|
|
| 152 |
with open('data/preprocessed_data.pkl', 'rb') as f:
|
| 153 |
data = pickle.load(f)
|
| 154 |
|
| 155 |
+
# Debug info about the file contents
|
| 156 |
+
print(f"Loaded preprocessed data with keys: {list(data.keys())}")
|
| 157 |
+
|
| 158 |
# Create a new vector database
|
| 159 |
vector_db = VectorDatabase()
|
| 160 |
|
| 161 |
+
# Check that vectors dictionary has data
|
| 162 |
+
if 'vectors' in data and data['vectors']:
|
| 163 |
+
print(f"Vectors dictionary has {len(data['vectors'])} entries")
|
| 164 |
+
# Directly populate the vectors dictionary
|
| 165 |
+
for key, vector in data['vectors'].items():
|
| 166 |
+
vector_db.insert(key, vector)
|
| 167 |
+
else:
|
| 168 |
+
print("WARNING: No vectors found in preprocessed data")
|
| 169 |
|
| 170 |
# Get metadata and original texts if available
|
| 171 |
metadata = data.get('metadata', [])
|
| 172 |
texts = data.get('texts', [])
|
| 173 |
|
| 174 |
+
print(f"Loaded {len(metadata)} metadata entries and {len(texts)} texts")
|
| 175 |
+
|
| 176 |
+
# Verify a sample of metadata to debug page numbering
|
| 177 |
+
if metadata and len(metadata) > 0:
|
| 178 |
+
page_counts = {}
|
| 179 |
+
for meta in metadata:
|
| 180 |
+
filename = meta.get('filename', 'unknown')
|
| 181 |
+
page = meta.get('page', 'unknown')
|
| 182 |
+
if filename not in page_counts:
|
| 183 |
+
page_counts[filename] = set()
|
| 184 |
+
page_counts[filename].add(page)
|
| 185 |
+
|
| 186 |
+
print(f"Found {len(page_counts)} unique files with pages:")
|
| 187 |
+
for filename, pages in page_counts.items():
|
| 188 |
+
print(f" - {filename}: {len(pages)} unique pages (min: {min(pages)}, max: {max(pages)})")
|
| 189 |
+
|
| 190 |
return vector_db, metadata, texts
|
| 191 |
|
| 192 |
@cl.on_chat_start
|