Spaces:
Sleeping
Sleeping
Commit
·
399f589
1
Parent(s):
d99512d
new doc id filter + 100 + 100 retrieval
Browse files- index_retriever.py +24 -23
index_retriever.py
CHANGED
|
@@ -26,12 +26,11 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
|
|
| 26 |
import re
|
| 27 |
|
| 28 |
def extract_doc_id_from_query(query):
|
| 29 |
-
"""Extract document IDs from query text"""
|
| 30 |
-
# Match patterns like: 袚袨小孝 59023.2, 袧袩-104, 袚袨小孝 袪 50.04.07-2022
|
| 31 |
patterns = [
|
| 32 |
-
r'
|
| 33 |
-
r'袧袩-\d+(?:-\d+)?',
|
| 34 |
-
r'袦校[_\s]
|
| 35 |
]
|
| 36 |
|
| 37 |
found_ids = []
|
|
@@ -39,33 +38,35 @@ def extract_doc_id_from_query(query):
|
|
| 39 |
matches = re.findall(pattern, query, re.IGNORECASE)
|
| 40 |
found_ids.extend(matches)
|
| 41 |
|
| 42 |
-
# Normalize spacing
|
| 43 |
-
normalized = [re.sub(r'\s+', ' ', id.strip()) for id in found_ids]
|
| 44 |
return normalized
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
"""Keep nodes that match any of the document IDs"""
|
| 49 |
if not doc_ids:
|
| 50 |
return nodes
|
| 51 |
|
| 52 |
-
from difflib import SequenceMatcher
|
| 53 |
-
|
| 54 |
filtered = []
|
| 55 |
for node in nodes:
|
| 56 |
-
node_doc_id = node.metadata.get('document_id', '').upper()
|
|
|
|
| 57 |
|
| 58 |
for query_doc_id in doc_ids:
|
| 59 |
-
query_doc_id = query_doc_id.upper()
|
| 60 |
|
| 61 |
-
#
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
#
|
| 67 |
-
|
| 68 |
-
if similarity >= threshold:
|
| 69 |
filtered.append(node)
|
| 70 |
break
|
| 71 |
|
|
@@ -92,11 +93,11 @@ def create_query_engine(vector_index):
|
|
| 92 |
|
| 93 |
vector_retriever = VectorIndexRetriever(
|
| 94 |
index=vector_index,
|
| 95 |
-
similarity_top_k=
|
| 96 |
)
|
| 97 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 98 |
docstore=vector_index.docstore,
|
| 99 |
-
similarity_top_k=
|
| 100 |
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 101 |
|
| 102 |
)
|
|
|
|
| 26 |
import re
|
| 27 |
|
| 28 |
def extract_doc_id_from_query(query):
    """Extract Russian regulatory document IDs (ГОСТ / НП / МУ) from query text.

    Scans *query* for identifiers such as "ГОСТ 59023.4",
    "ГОСТ Р 50.05.01-2018", "НП-104-18" or "МУ 1.2.3.07.0057-2018".

    Args:
        query: Free-form user query string.

    Returns:
        List of matched IDs, upper-cased, trimmed, and with internal runs of
        whitespace collapsed to single spaces. Empty list when nothing matches.
    """
    # NOTE(review): the source file was mojibake-garbled (e.g. "袚袨小孝" for
    # "ГОСТ"); the Cyrillic prefixes below are the restored originals.
    patterns = [
        r'ГОСТ\s*Р?\s*\d+(?:\.\d+)*(?:-\d{4})?',       # ГОСТ 59023.4, ГОСТ Р 50.05.01-2018
        r'НП-\d+(?:-\d+)?',                            # НП-104-18
        r'МУ[_\s]\d+(?:\.\d+)+(?:\.\d+)*(?:-\d{4})?',  # МУ 1.2.3.07.0057-2018
    ]

    found_ids = []
    for pattern in patterns:
        # re.IGNORECASE also folds Cyrillic case for str patterns in Python 3.
        found_ids.extend(re.findall(pattern, query, re.IGNORECASE))

    # Normalize spacing and case; loop variable renamed from `id`, which
    # shadowed the builtin.
    return [re.sub(r'\s+', ' ', doc_id.strip().upper()) for doc_id in found_ids]
|
| 44 |
|
| 45 |
+
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.85):
    """Keep only nodes whose ``document_id`` metadata matches a requested ID.

    Matching strategy, per node:
      1. Compare the dotted "base number" (e.g. "50.05.01" out of
         "ГОСТ Р 50.05.01-2018") extracted from both IDs.
      2. Fall back to a bidirectional substring test on the normalized IDs.

    Args:
        nodes: Retrieved nodes; each must expose ``node.metadata`` (a dict).
        doc_ids: Document IDs extracted from the query (typically the output
            of ``extract_doc_id_from_query``). Empty/None disables filtering.
        threshold: Unused; retained for backward compatibility with the
            previous fuzzy ``SequenceMatcher`` implementation.

    Returns:
        The filtered list of nodes, or *nodes* unchanged when *doc_ids* is
        empty.
    """
    if not doc_ids:
        return nodes

    # Dotted number such as "50.05.01"; compiled once, used for both sides.
    base_number = re.compile(r'(\d+(?:\.\d+)+)')

    filtered = []
    for node in nodes:
        node_doc_id = re.sub(
            r'\s+', ' ', node.metadata.get('document_id', '').upper().strip()
        )
        # BUGFIX: skip nodes without a document_id. The empty string is a
        # substring of everything, so they previously matched every query ID.
        if not node_doc_id:
            continue

        node_base = base_number.search(node_doc_id)  # invariant per node; hoisted
        for query_doc_id in doc_ids:
            query_doc_id = query_doc_id.upper().strip()

            # 1) Match when the dotted base numbers are identical.
            query_base = base_number.search(query_doc_id)
            if node_base and query_base and node_base.group(1) == query_base.group(1):
                filtered.append(node)
                break

            # 2) Fallback: exact substring match in either direction.
            if query_doc_id in node_doc_id or node_doc_id in query_doc_id:
                filtered.append(node)
                break

    return filtered
|
| 72 |
|
|
|
|
| 93 |
|
| 94 |
vector_retriever = VectorIndexRetriever(
|
| 95 |
index=vector_index,
|
| 96 |
+
similarity_top_k=100
|
| 97 |
)
|
| 98 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 99 |
docstore=vector_index.docstore,
|
| 100 |
+
similarity_top_k=100,
|
| 101 |
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 102 |
|
| 103 |
)
|