MrSimple07 committed on
Commit
1ca91bc
·
1 Parent(s): 63ebb90

new deduplication

Browse files
Files changed (1) hide show
  1. utils.py +28 -5
utils.py CHANGED
@@ -124,24 +124,47 @@ def generate_sources_html(nodes, chunks_df=None):
124
  return html
125
 
126
  def deduplicate_nodes(nodes):
127
- """Deduplicate retrieved nodes based on unique identifiers"""
128
  seen = set()
129
  unique_nodes = []
130
 
131
  for node in nodes:
132
  doc_id = node.metadata.get('document_id', '')
133
- section_id = node.metadata.get('section_id', '')
134
  node_type = node.metadata.get('type', 'text')
135
 
136
  if node_type == 'table' or node_type == 'table_row':
137
  table_num = node.metadata.get('table_number', '')
138
- identifier = f"{doc_id}|table|{table_num}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  elif node_type == 'image':
140
  img_num = node.metadata.get('image_number', '')
141
  identifier = f"{doc_id}|image|{img_num}"
142
- else:
 
 
143
  chunk_id = node.metadata.get('chunk_id', 0)
144
- identifier = f"{doc_id}|{section_id}|{chunk_id}"
 
145
 
146
  if identifier not in seen:
147
  seen.add(identifier)
 
124
  return html
125
 
126
  def deduplicate_nodes(nodes):
127
+ """Deduplicate retrieved nodes based on content and metadata"""
128
  seen = set()
129
  unique_nodes = []
130
 
131
  for node in nodes:
132
  doc_id = node.metadata.get('document_id', '')
 
133
  node_type = node.metadata.get('type', 'text')
134
 
135
  if node_type == 'table' or node_type == 'table_row':
136
  table_num = node.metadata.get('table_number', '')
137
+ table_identifier = node.metadata.get('table_identifier', table_num)
138
+
139
+ # Use row range to distinguish table chunks
140
+ row_start = node.metadata.get('row_start', '')
141
+ row_end = node.metadata.get('row_end', '')
142
+ is_complete = node.metadata.get('is_complete_table', False)
143
+
144
+ if is_complete:
145
+ identifier = f"{doc_id}|table|{table_identifier}|complete"
146
+ elif row_start != '' and row_end != '':
147
+ identifier = f"{doc_id}|table|{table_identifier}|rows_{row_start}_{row_end}"
148
+ else:
149
+ # Fallback: use chunk_id if available
150
+ chunk_id = node.metadata.get('chunk_id', '')
151
+ if chunk_id != '':
152
+ identifier = f"{doc_id}|table|{table_identifier}|chunk_{chunk_id}"
153
+ else:
154
+ # Last resort: hash first 100 chars of content
155
+ import hashlib
156
+ content_hash = hashlib.md5(node.text[:100].encode()).hexdigest()[:8]
157
+ identifier = f"{doc_id}|table|{table_identifier}|{content_hash}"
158
+
159
  elif node_type == 'image':
160
  img_num = node.metadata.get('image_number', '')
161
  identifier = f"{doc_id}|image|{img_num}"
162
+
163
+ else: # text
164
+ section_id = node.metadata.get('section_id', '')
165
  chunk_id = node.metadata.get('chunk_id', 0)
166
+ # For text, section_id + chunk_id should be unique
167
+ identifier = f"{doc_id}|text|{section_id}|{chunk_id}"
168
 
169
  if identifier not in seen:
170
  seen.add(identifier)