Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Oct 6, 2025

Commit

bb76787

1 Parent(s): 5514fbd

doc id problem fixed

Browse files

Files changed (1) hide show

index_retriever.py +12 -16

index_retriever.py CHANGED Viewed

@@ -31,20 +31,21 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
 def normalize_doc_id(doc_id: str) -> str:
-    """Normalize document ID for consistent comparison."""
     doc_id = doc_id.upper().strip()
-    doc_id = re.sub(r'[^\w\d\.]+', '', doc_id)  # remove spaces, dashes, etc.
     doc_id = doc_id.replace("ГОСТР", "ГОСТ")
     doc_id = doc_id.replace("GOSTR", "ГОСТ")
     return doc_id
 def base_number(doc_id: str) -> str:
-    """Extract base numeric pattern (e.g., '59023.4' from 'ГОСТ Р 59023.4-2020')."""
-    m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
     return m.group(1) if m else ""
-def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.75):
-    """Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
     if not doc_ids:
         return nodes
@@ -57,22 +58,17 @@ def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.75):
         node_base = base_number(node_doc_id)
         for q_doc, q_base in zip(doc_ids_norm, doc_ids_base):
-            # Strong match: same base number (e.g., 59023.4)
             if q_base and node_base and q_base == node_base:
                 filtered.append(node)
                 break
-            # Medium match: similarity ratio > threshold
-            if SequenceMatcher(None, node_doc_id, q_doc).ratio() >= threshold:
-                filtered.append(node)
-                break
-            # Weak fallback: contains or partial substring
-            if q_base in node_doc_id or q_doc in node_doc_id:
                 filtered.append(node)
                 break
-    return filtered if filtered else nodes  # Fallback: keep all if none matched
 def extract_doc_id_from_query(query):

 def normalize_doc_id(doc_id: str) -> str:
+    """Normalize document ID - KEEP dots for numeric parts"""
     doc_id = doc_id.upper().strip()
+    doc_id = re.sub(r'\s+', '', doc_id)  # Remove spaces only
     doc_id = doc_id.replace("ГОСТР", "ГОСТ")
     doc_id = doc_id.replace("GOSTR", "ГОСТ")
     return doc_id
 def base_number(doc_id: str) -> str:
+    """Extract full numeric pattern including all parts (e.g., '59023.6' from 'ГОСТ 59023.6')"""
+    # Match: 59023.6 or 59023.4 or 50.05.01 etc.
+    m = re.search(r'(\d+(?:\.\d+)*)', doc_id)
     return m.group(1) if m else ""
+def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.85):
+    """Filter nodes by document ID with strict numeric matching"""
     if not doc_ids:
         return nodes
         node_base = base_number(node_doc_id)
         for q_doc, q_base in zip(doc_ids_norm, doc_ids_base):
+            # STRICT: base number must match exactly
             if q_base and node_base and q_base == node_base:
                 filtered.append(node)
                 break
+            # STRICT: full normalized ID must match exactly or have very high similarity
+            elif SequenceMatcher(None, node_doc_id, q_doc).ratio() >= threshold:
                 filtered.append(node)
                 break
+    return filtered if filtered else nodes
 def extract_doc_id_from_query(query):