Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 6, 2025

Commit

1ca91bc

1 Parent(s): 63ebb90

new deduplication

Browse files

Files changed (1) hide show

utils.py +28 -5

utils.py CHANGED Viewed

@@ -124,24 +124,47 @@ def generate_sources_html(nodes, chunks_df=None):
     return html
 def deduplicate_nodes(nodes):
-    """Deduplicate retrieved nodes based on unique identifiers"""
     seen = set()
     unique_nodes = []
     for node in nodes:
         doc_id = node.metadata.get('document_id', '')
-        section_id = node.metadata.get('section_id', '')
         node_type = node.metadata.get('type', 'text')
         if node_type == 'table' or node_type == 'table_row':
             table_num = node.metadata.get('table_number', '')
-            identifier = f"{doc_id}|table|{table_num}"
         elif node_type == 'image':
             img_num = node.metadata.get('image_number', '')
             identifier = f"{doc_id}|image|{img_num}"
-        else:
             chunk_id = node.metadata.get('chunk_id', 0)
-            identifier = f"{doc_id}|{section_id}|{chunk_id}"
         if identifier not in seen:
             seen.add(identifier)

     return html
 def deduplicate_nodes(nodes):
+    """Deduplicate retrieved nodes based on content and metadata"""
     seen = set()
     unique_nodes = []
     for node in nodes:
         doc_id = node.metadata.get('document_id', '')
         node_type = node.metadata.get('type', 'text')
         if node_type == 'table' or node_type == 'table_row':
             table_num = node.metadata.get('table_number', '')
+            table_identifier = node.metadata.get('table_identifier', table_num)
+            # Use row range to distinguish table chunks
+            row_start = node.metadata.get('row_start', '')
+            row_end = node.metadata.get('row_end', '')
+            is_complete = node.metadata.get('is_complete_table', False)
+            if is_complete:
+                identifier = f"{doc_id}|table|{table_identifier}|complete"
+            elif row_start != '' and row_end != '':
+                identifier = f"{doc_id}|table|{table_identifier}|rows_{row_start}_{row_end}"
+            else:
+                # Fallback: use chunk_id if available
+                chunk_id = node.metadata.get('chunk_id', '')
+                if chunk_id != '':
+                    identifier = f"{doc_id}|table|{table_identifier}|chunk_{chunk_id}"
+                else:
+                    # Last resort: hash first 100 chars of content
+                    import hashlib
+                    content_hash = hashlib.md5(node.text[:100].encode()).hexdigest()[:8]
+                    identifier = f"{doc_id}|table|{table_identifier}|{content_hash}"
         elif node_type == 'image':
             img_num = node.metadata.get('image_number', '')
             identifier = f"{doc_id}|image|{img_num}"
+        else:  # text
+            section_id = node.metadata.get('section_id', '')
             chunk_id = node.metadata.get('chunk_id', 0)
+            # For text, section_id + chunk_id should be unique
+            identifier = f"{doc_id}|text|{section_id}|{chunk_id}"
         if identifier not in seen:
             seen.add(identifier)