MrSimple07 committed on
Commit
5514fbd
1 Parent(s): 399f589

doc id problem fixed

Browse files
Files changed (1) hide show
  1. index_retriever.py +56 -42
index_retriever.py CHANGED
@@ -6,6 +6,12 @@ from llama_index.core.retrievers import QueryFusionRetriever
6
  from llama_index.core.response_synthesizers import get_response_synthesizer
7
  from my_logging import log_message
8
 
 
 
 
 
 
 
9
  def create_vector_index(documents):
10
  """Create vector index from documents"""
11
  log_message(f"Building vector index from {len(documents)} documents...")
@@ -23,7 +29,51 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
23
  filtered.append(node)
24
  return filtered
25
 
26
- import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def extract_doc_id_from_query(query):
29
  """Extract document IDs from query text with better pattern matching"""
@@ -41,37 +91,6 @@ def extract_doc_id_from_query(query):
41
  # Normalize spacing and preserve dots
42
  normalized = [re.sub(r'\s+', ' ', id.strip().upper()) for id in found_ids]
43
  return normalized
44
-
45
- def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.85):
46
- """Keep nodes that match any of the document IDs with better matching"""
47
- if not doc_ids:
48
- return nodes
49
-
50
- filtered = []
51
- for node in nodes:
52
- node_doc_id = node.metadata.get('document_id', '').upper().strip()
53
- node_doc_id_normalized = re.sub(r'\s+', ' ', node_doc_id)
54
-
55
- for query_doc_id in doc_ids:
56
- query_doc_id = query_doc_id.upper().strip()
57
-
58
- # Extract base number for comparison (e.g., "59023.4" from "袚袨小孝 袪 59023.4-2020")
59
- node_base = re.search(r'(\d+(?:\.\d+)+)', node_doc_id_normalized)
60
- query_base = re.search(r'(\d+(?:\.\d+)+)', query_doc_id)
61
-
62
- # Match if base numbers are identical
63
- if node_base and query_base:
64
- if node_base.group(1) == query_base.group(1):
65
- filtered.append(node)
66
- break
67
-
68
- # Fallback: exact substring match
69
- if query_doc_id in node_doc_id_normalized or node_doc_id_normalized in query_doc_id:
70
- filtered.append(node)
71
- break
72
-
73
- return filtered
74
-
75
  def russian_tokenizer(text):
76
  """Better tokenizer for Russian document IDs and technical terms"""
77
  import re
@@ -116,16 +135,11 @@ def create_query_engine(vector_index):
116
  doc_ids = extract_doc_id_from_query(query)
117
  if doc_ids:
118
  log_message(f"Detected document IDs in query: {doc_ids}")
119
-
120
- # Filter by document ID
121
- doc_filtered = filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.7)
122
- log_message(f"After doc ID filter: {len(doc_filtered)} nodes")
123
-
124
- # If we found matching documents, use only those
125
- if doc_filtered:
126
- nodes = doc_filtered
127
- else:
128
- log_message("WARNING: No nodes matched document IDs, using all results")
129
 
130
  # Deduplication
131
  seen_hashes = set()
 
6
  from llama_index.core.response_synthesizers import get_response_synthesizer
7
  from my_logging import log_message
8
 
9
# Standard-library imports used by doc-id normalization and fuzzy matching.
import re
from difflib import SequenceMatcher
13
+
14
+
15
  def create_vector_index(documents):
16
  """Create vector index from documents"""
17
  log_message(f"Building vector index from {len(documents)} documents...")
 
29
  filtered.append(node)
30
  return filtered
31
 
32
+
33
def normalize_doc_id(doc_id: str) -> str:
    """Return *doc_id* in a canonical form for comparison.

    Uppercases, strips, drops every character that is not a word character
    or a dot (spaces, dashes, etc.), then collapses known spelling variants
    of the standard prefix to a single canonical form.
    """
    cleaned = re.sub(r'[^\w\d\.]+', '', doc_id.upper().strip())
    # Collapse prefix variants to one canonical spelling.
    for variant, canonical in (("袚袨小孝袪", "袚袨小孝"), ("GOSTR", "袚袨小孝")):
        cleaned = cleaned.replace(variant, canonical)
    return cleaned
40
+
41
def base_number(doc_id: str) -> str:
    """Extract the dotted numeric core of a document ID.

    Example: returns ``'59023.4'`` from ``'ГОСТ Р 59023.4-2020'``.
    Returns an empty string when no dotted number is present.
    """
    match = re.search(r'(\d+(?:\.\d+)+)', doc_id)
    if match is None:
        return ""
    return match.group(1)
45
+
46
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.75):
    """Filter retrieved nodes by normalized document ID.

    For each query ID, matching is tried in order of strength:
      1. identical dotted base number (e.g. ``'59023.4'``),
      2. fuzzy similarity ratio >= ``threshold``,
      3. substring containment (guarded against empty patterns).

    Args:
        nodes: retrieved nodes; each node's ``metadata['document_id']``
            is compared (missing key treated as empty string).
        doc_ids: document IDs extracted from the query text.
        threshold: minimum ``SequenceMatcher`` ratio for the fuzzy match.

    Returns:
        The matching nodes; falls back to all of ``nodes`` when ``doc_ids``
        is empty or nothing matched (fail-open so retrieval never goes empty).
    """
    if not doc_ids:
        return nodes

    queries = [normalize_doc_id(d) for d in doc_ids]
    bases = [base_number(q) for q in queries]

    filtered = []
    for node in nodes:
        node_id = normalize_doc_id(node.metadata.get('document_id', ''))
        node_base = base_number(node_id)

        for q_doc, q_base in zip(queries, bases):
            # Strong match: identical base number (e.g. 59023.4).
            if q_base and node_base and q_base == node_base:
                filtered.append(node)
                break

            # Medium match: overall string similarity above threshold.
            if SequenceMatcher(None, node_id, q_doc).ratio() >= threshold:
                filtered.append(node)
                break

            # Weak fallback: substring containment. BUGFIX: guard against
            # empty patterns -- '' in s is always True, so a query ID with
            # no dotted number used to match every single node.
            if (q_base and q_base in node_id) or (q_doc and q_doc in node_id):
                filtered.append(node)
                break

    # Fail-open: keep all nodes if nothing matched at all.
    return filtered if filtered else nodes
76
+
77
 
78
  def extract_doc_id_from_query(query):
79
  """Extract document IDs from query text with better pattern matching"""
 
91
  # Normalize spacing and preserve dots
92
  normalized = [re.sub(r'\s+', ' ', id.strip().upper()) for id in found_ids]
93
  return normalized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def russian_tokenizer(text):
95
  """Better tokenizer for Russian document IDs and technical terms"""
96
  import re
 
135
  doc_ids = extract_doc_id_from_query(query)
136
  if doc_ids:
137
  log_message(f"Detected document IDs in query: {doc_ids}")
138
+ before = len(nodes)
139
+ nodes = filter_nodes_by_doc_id(nodes, doc_ids)
140
+ after = len(nodes)
141
+ log_message(f"Filtered by doc ID: {after}/{before} nodes kept (fallback safe)")
142
+
 
 
 
 
 
143
 
144
  # Deduplication
145
  seen_hashes = set()