Spaces:
Sleeping
Sleeping
Commit
·
1ca91bc
1
Parent(s):
63ebb90
new deduplication
Browse files
utils.py
CHANGED
|
@@ -124,24 +124,47 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 124 |
return html
|
| 125 |
|
| 126 |
def deduplicate_nodes(nodes):
|
| 127 |
-
"""Deduplicate retrieved nodes based on
|
| 128 |
seen = set()
|
| 129 |
unique_nodes = []
|
| 130 |
|
| 131 |
for node in nodes:
|
| 132 |
doc_id = node.metadata.get('document_id', '')
|
| 133 |
-
section_id = node.metadata.get('section_id', '')
|
| 134 |
node_type = node.metadata.get('type', 'text')
|
| 135 |
|
| 136 |
if node_type == 'table' or node_type == 'table_row':
|
| 137 |
table_num = node.metadata.get('table_number', '')
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
elif node_type == 'image':
|
| 140 |
img_num = node.metadata.get('image_number', '')
|
| 141 |
identifier = f"{doc_id}|image|{img_num}"
|
| 142 |
-
|
|
|
|
|
|
|
| 143 |
chunk_id = node.metadata.get('chunk_id', 0)
|
| 144 |
-
|
|
|
|
| 145 |
|
| 146 |
if identifier not in seen:
|
| 147 |
seen.add(identifier)
|
|
|
|
| 124 |
return html
|
| 125 |
|
| 126 |
def deduplicate_nodes(nodes):
|
| 127 |
+
"""Deduplicate retrieved nodes based on content and metadata"""
|
| 128 |
seen = set()
|
| 129 |
unique_nodes = []
|
| 130 |
|
| 131 |
for node in nodes:
|
| 132 |
doc_id = node.metadata.get('document_id', '')
|
|
|
|
| 133 |
node_type = node.metadata.get('type', 'text')
|
| 134 |
|
| 135 |
if node_type == 'table' or node_type == 'table_row':
|
| 136 |
table_num = node.metadata.get('table_number', '')
|
| 137 |
+
table_identifier = node.metadata.get('table_identifier', table_num)
|
| 138 |
+
|
| 139 |
+
# Use row range to distinguish table chunks
|
| 140 |
+
row_start = node.metadata.get('row_start', '')
|
| 141 |
+
row_end = node.metadata.get('row_end', '')
|
| 142 |
+
is_complete = node.metadata.get('is_complete_table', False)
|
| 143 |
+
|
| 144 |
+
if is_complete:
|
| 145 |
+
identifier = f"{doc_id}|table|{table_identifier}|complete"
|
| 146 |
+
elif row_start != '' and row_end != '':
|
| 147 |
+
identifier = f"{doc_id}|table|{table_identifier}|rows_{row_start}_{row_end}"
|
| 148 |
+
else:
|
| 149 |
+
# Fallback: use chunk_id if available
|
| 150 |
+
chunk_id = node.metadata.get('chunk_id', '')
|
| 151 |
+
if chunk_id != '':
|
| 152 |
+
identifier = f"{doc_id}|table|{table_identifier}|chunk_{chunk_id}"
|
| 153 |
+
else:
|
| 154 |
+
# Last resort: hash first 100 chars of content
|
| 155 |
+
import hashlib
|
| 156 |
+
content_hash = hashlib.md5(node.text[:100].encode()).hexdigest()[:8]
|
| 157 |
+
identifier = f"{doc_id}|table|{table_identifier}|{content_hash}"
|
| 158 |
+
|
| 159 |
elif node_type == 'image':
|
| 160 |
img_num = node.metadata.get('image_number', '')
|
| 161 |
identifier = f"{doc_id}|image|{img_num}"
|
| 162 |
+
|
| 163 |
+
else: # text
|
| 164 |
+
section_id = node.metadata.get('section_id', '')
|
| 165 |
chunk_id = node.metadata.get('chunk_id', 0)
|
| 166 |
+
# For text, section_id + chunk_id should be unique
|
| 167 |
+
identifier = f"{doc_id}|text|{section_id}|{chunk_id}"
|
| 168 |
|
| 169 |
if identifier not in seen:
|
| 170 |
seen.add(identifier)
|