Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 6, 2025

Commit

5514fbd

1 Parent(s): 399f589

doc id problem fixed

Browse files

Files changed (1) hide show

index_retriever.py +56 -42

index_retriever.py CHANGED Viewed

@@ -6,6 +6,12 @@ from llama_index.core.retrievers import QueryFusionRetriever
 from llama_index.core.response_synthesizers import get_response_synthesizer
 from my_logging import log_message
 def create_vector_index(documents):
     """Create vector index from documents"""
     log_message(f"Building vector index from {len(documents)} documents...")
@@ -23,7 +29,51 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
             filtered.append(node)
     return filtered
-import re
 def extract_doc_id_from_query(query):
     """Extract document IDs from query text with better pattern matching"""
@@ -41,37 +91,6 @@ def extract_doc_id_from_query(query):
     # Normalize spacing and preserve dots
     normalized = [re.sub(r'\s+', ' ', id.strip().upper()) for id in found_ids]
     return normalized
-def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.85):
-    """Keep nodes that match any of the document IDs with better matching"""
-    if not doc_ids:
-        return nodes
-    filtered = []
-    for node in nodes:
-        node_doc_id = node.metadata.get('document_id', '').upper().strip()
-        node_doc_id_normalized = re.sub(r'\s+', ' ', node_doc_id)
-        for query_doc_id in doc_ids:
-            query_doc_id = query_doc_id.upper().strip()
-            # Extract base number for comparison (e.g., "59023.4" from "ГОСТ Р 59023.4-2020")
-            node_base = re.search(r'(\d+(?:\.\d+)+)', node_doc_id_normalized)
-            query_base = re.search(r'(\d+(?:\.\d+)+)', query_doc_id)
-            # Match if base numbers are identical
-            if node_base and query_base:
-                if node_base.group(1) == query_base.group(1):
-                    filtered.append(node)
-                    break
-            # Fallback: exact substring match
-            if query_doc_id in node_doc_id_normalized or node_doc_id_normalized in query_doc_id:
-                filtered.append(node)
-                break
-    return filtered
 def russian_tokenizer(text):
     """Better tokenizer for Russian document IDs and technical terms"""
     import re
@@ -116,16 +135,11 @@ def create_query_engine(vector_index):
             doc_ids = extract_doc_id_from_query(query)
             if doc_ids:
                 log_message(f"Detected document IDs in query: {doc_ids}")
-                # Filter by document ID
-                doc_filtered = filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.7)
-                log_message(f"After doc ID filter: {len(doc_filtered)} nodes")
-                # If we found matching documents, use only those
-                if doc_filtered:
-                    nodes = doc_filtered
-                else:
-                    log_message("WARNING: No nodes matched document IDs, using all results")
             # Deduplication
             seen_hashes = set()

 from llama_index.core.response_synthesizers import get_response_synthesizer
 from my_logging import log_message
+import re
+import re
+from difflib import SequenceMatcher
 def create_vector_index(documents):
     """Create vector index from documents"""
     log_message(f"Building vector index from {len(documents)} documents...")
             filtered.append(node)
     return filtered
def normalize_doc_id(doc_id: str) -> str:
    """Normalize a document ID so that equivalent spellings compare equal.

    The ID is upper-cased, every character that is not a word character or a
    dot is removed (spaces, dashes, slashes, ...), and the prefixes
    "ГОСТ Р" / "GOST R" are collapsed to "ГОСТ".

    Because dashes are removed, the number and the year run together:
    "ГОСТ Р 59023.4-2020" becomes "ГОСТ59023.42020". Extract base numbers
    from the raw ID, not from this normalized form.

    Args:
        doc_id: Raw document ID; ``None`` or an empty value is accepted.

    Returns:
        The normalized ID, or "" when ``doc_id`` is empty or ``None``.
    """
    if not doc_id:
        return ""
    doc_id = str(doc_id).upper().strip()
    doc_id = re.sub(r'[^\w.]+', '', doc_id)  # remove spaces, dashes, etc.
    doc_id = doc_id.replace("ГОСТР", "ГОСТ")
    doc_id = doc_id.replace("GOSTR", "ГОСТ")
    return doc_id


def base_number(doc_id: str) -> str:
    """Extract the dotted base number (e.g. '59023.4' from 'ГОСТ Р 59023.4-2020').

    Only numbers containing at least one dot are recognized; an ID such as
    'ГОСТ 12345-2020' has no base number.

    Returns:
        The first dotted number found, or "" when there is none.
    """
    m = re.search(r'(\d+(?:\.\d+)+)', doc_id or "")
    return m.group(1) if m else ""


def _doc_id_matches(node_doc_id, node_base, q_doc, q_base, threshold):
    """Return True when a node's document ID matches one query document ID."""
    # Strong match: same base number (e.g., 59023.4).
    if q_base and q_base == node_base:
        return True
    # Weak match: the query's base number or whole normalized ID is contained
    # in the node ID. The emptiness guard matters: "" is a substring of every
    # string, so an unguarded check would accept every node.
    if (q_base and q_base in node_doc_id) or q_doc in node_doc_id:
        return True
    # Medium match: overall similarity of the normalized IDs.
    return SequenceMatcher(None, node_doc_id, q_doc).ratio() >= threshold


def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.75):
    """Keep the nodes whose ``metadata['document_id']`` matches a query ID.

    A node is kept when, for at least one query ID, the base numbers are
    equal, or the query's base number / normalized ID is a substring of the
    node's normalized ID, or the similarity ratio of the two normalized IDs
    is at least ``threshold``.

    Base numbers are taken from the raw IDs, because normalization removes
    the dash that separates the number from the year.

    Args:
        nodes: Objects exposing a ``metadata`` mapping.
        doc_ids: Document IDs detected in the query.
        threshold: Minimum ``SequenceMatcher`` ratio for a fuzzy match.

    Returns:
        The matching nodes in their original order. ``nodes`` itself is
        returned when ``doc_ids`` is empty, when no query ID survives
        normalization, or when nothing matched (safe fallback).
    """
    if not doc_ids:
        return nodes

    # (normalized id, base number) for every usable query ID.
    targets = []
    for raw_query_id in doc_ids:
        q_doc = normalize_doc_id(raw_query_id)
        if q_doc:
            targets.append((q_doc, base_number(str(raw_query_id))))
    if not targets:
        return nodes

    filtered = []
    for node in nodes:
        raw_node_id = str(node.metadata.get('document_id') or '')
        node_doc_id = normalize_doc_id(raw_node_id)
        if not node_doc_id:
            # A node without a document ID cannot match any query ID.
            continue
        node_base = base_number(raw_node_id)
        if any(
            _doc_id_matches(node_doc_id, node_base, q_doc, q_base, threshold)
            for q_doc, q_base in targets
        ):
            filtered.append(node)

    return filtered if filtered else nodes  # Fallback: keep all if none matched
 def extract_doc_id_from_query(query):
     """Extract document IDs from query text with better pattern matching"""
     # Normalize spacing and preserve dots
     normalized = [re.sub(r'\s+', ' ', id.strip().upper()) for id in found_ids]
     return normalized
 def russian_tokenizer(text):
     """Better tokenizer for Russian document IDs and technical terms"""
     import re
             doc_ids = extract_doc_id_from_query(query)
             if doc_ids:
                 log_message(f"Detected document IDs in query: {doc_ids}")
+                before = len(nodes)
+                nodes = filter_nodes_by_doc_id(nodes, doc_ids)
+                after = len(nodes)
+                log_message(f"Filtered by doc ID: {after}/{before} nodes kept (fallback safe)")
             # Deduplication
             seen_hashes = set()