MrSimple07 committed on
Commit
399f589
1 Parent(s): d99512d

New doc ID filter; retrieval top-k raised to 100 + 100 (vector + BM25)

Browse files
Files changed (1) hide show
  1. index_retriever.py +24 -23
index_retriever.py CHANGED
@@ -26,12 +26,11 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
26
  import re
27
 
28
  def extract_doc_id_from_query(query):
29
- """Extract document IDs from query text"""
30
- # Match patterns like: ГОСТ 59023.2, НП-104, ГОСТ Р 50.04.07-2022
31
  patterns = [
32
- r'(?:袚袨小孝\s*袪?\s*)[\d\.]+(?:-\d{4})?', # 袚袨小孝 patterns
33
- r'袧袩-\d+(?:-\d+)?', # 袧袩 patterns
34
- r'袦校[_\s][\d\.]+', # 袦校 patterns
35
  ]
36
 
37
  found_ids = []
@@ -39,33 +38,35 @@ def extract_doc_id_from_query(query):
39
  matches = re.findall(pattern, query, re.IGNORECASE)
40
  found_ids.extend(matches)
41
 
42
- # Normalize spacing
43
- normalized = [re.sub(r'\s+', ' ', id.strip()) for id in found_ids]
44
  return normalized
45
 
46
-
47
- def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.8):
48
- """Keep nodes that match any of the document IDs"""
49
  if not doc_ids:
50
  return nodes
51
 
52
- from difflib import SequenceMatcher
53
-
54
  filtered = []
55
  for node in nodes:
56
- node_doc_id = node.metadata.get('document_id', '').upper()
 
57
 
58
  for query_doc_id in doc_ids:
59
- query_doc_id = query_doc_id.upper()
60
 
61
- # Exact substring match
62
- if query_doc_id in node_doc_id or node_doc_id in query_doc_id:
63
- filtered.append(node)
64
- break
 
 
 
 
 
65
 
66
- # Fuzzy match for close variants
67
- similarity = SequenceMatcher(None, query_doc_id, node_doc_id).ratio()
68
- if similarity >= threshold:
69
  filtered.append(node)
70
  break
71
 
@@ -92,11 +93,11 @@ def create_query_engine(vector_index):
92
 
93
  vector_retriever = VectorIndexRetriever(
94
  index=vector_index,
95
- similarity_top_k=50
96
  )
97
  bm25_retriever = BM25Retriever.from_defaults(
98
  docstore=vector_index.docstore,
99
- similarity_top_k=50,
100
  tokenizer=russian_tokenizer # Add custom tokenizer
101
 
102
  )
 
26
  import re
27
 
28
  def extract_doc_id_from_query(query):
29
+ """Extract document IDs from query text with better pattern matching"""
 
30
  patterns = [
31
+ r'袚袨小孝\s*袪?\s*\d+(?:\.\d+)*(?:-\d{4})?', # 袚袨小孝 59023.4, 袚袨小孝 袪 50.05.01-2018
32
+ r'袧袩-\d+(?:-\d+)?', # 袧袩-104-18
33
+ r'袦校[_\s]\d+(?:\.\d+)+(?:\.\d+)*(?:-\d{4})?', # 袦校 1.2.3.07.0057-2018
34
  ]
35
 
36
  found_ids = []
 
38
  matches = re.findall(pattern, query, re.IGNORECASE)
39
  found_ids.extend(matches)
40
 
41
+ # Normalize spacing and preserve dots
42
+ normalized = [re.sub(r'\s+', ' ', id.strip().upper()) for id in found_ids]
43
  return normalized
44
 
45
+ def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.85):
46
+ """Keep nodes that match any of the document IDs with better matching"""
 
47
  if not doc_ids:
48
  return nodes
49
 
 
 
50
  filtered = []
51
  for node in nodes:
52
+ node_doc_id = node.metadata.get('document_id', '').upper().strip()
53
+ node_doc_id_normalized = re.sub(r'\s+', ' ', node_doc_id)
54
 
55
  for query_doc_id in doc_ids:
56
+ query_doc_id = query_doc_id.upper().strip()
57
 
58
+ # Extract base number for comparison (e.g., "59023.4" from "ГОСТ Р 59023.4-2020")
59
+ node_base = re.search(r'(\d+(?:\.\d+)+)', node_doc_id_normalized)
60
+ query_base = re.search(r'(\d+(?:\.\d+)+)', query_doc_id)
61
+
62
+ # Match if base numbers are identical
63
+ if node_base and query_base:
64
+ if node_base.group(1) == query_base.group(1):
65
+ filtered.append(node)
66
+ break
67
 
68
+ # Fallback: exact substring match
69
+ if query_doc_id in node_doc_id_normalized or node_doc_id_normalized in query_doc_id:
 
70
  filtered.append(node)
71
  break
72
 
 
93
 
94
  vector_retriever = VectorIndexRetriever(
95
  index=vector_index,
96
+ similarity_top_k=100
97
  )
98
  bm25_retriever = BM25Retriever.from_defaults(
99
  docstore=vector_index.docstore,
100
+ similarity_top_k=100,
101
  tokenizer=russian_tokenizer # Add custom tokenizer
102
 
103
  )