MrSimple07 committed
Commit a83db61 · 1 Parent(s): 806f3f9

Much lower reranking threshold (-0.5 instead of 0.1) + detailed score logging

Files changed (2):
  1. documents_prep.py +6 -1
  2. index_retriever.py +89 -7
documents_prep.py CHANGED
@@ -18,12 +18,17 @@ def chunk_text_documents(documents):
 
     chunked = []
     for doc in documents:
+        # Add document ID to text for better BM25 matching
+        doc_id = doc.metadata.get('document_id', '')
+        if doc_id and doc_id not in doc.text[:200]:
+            doc.text = f"[Документ: {doc_id}]\n\n{doc.text}"
+
         chunks = text_splitter.get_nodes_from_documents([doc])
         for i, chunk in enumerate(chunks):
             chunk.metadata.update({
                 'chunk_id': i,
                 'total_chunks': len(chunks),
-                'chunk_size': len(chunk.text)  # Add chunk size
+                'chunk_size': len(chunk.text)
             })
             chunked.append(chunk)
 
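For illustration, a standalone sketch of the new prefixing rule (sample text and ID invented, not part of the commit): a document whose ID is absent from its opening 200 characters gets a "[Документ: …]" header, so BM25 can match queries that cite the ID.

# Sketch of the prefixing rule from chunk_text_documents, on invented sample data
text = "Настоящий стандарт устанавливает требования к сварным соединениям..."
# ("This standard establishes requirements for welded joints...")
doc_id = "ГОСТ Р 50.04.07-2022"

# Only prefix when the ID is not already present near the top of the document
if doc_id and doc_id not in text[:200]:
    text = f"[Документ: {doc_id}]\n\n{text}"

print(text.splitlines()[0])  # [Документ: ГОСТ Р 50.04.07-2022]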
index_retriever.py CHANGED
@@ -23,21 +23,86 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
             filtered.append(node)
     return filtered
 
+import re
+
+
+def extract_doc_id_from_query(query):
+    """Extract document IDs from query text"""
+    # Match patterns like: ГОСТ 59023.2, НП-104, ГОСТ Р 50.04.07-2022
+    patterns = [
+        r'(?:ГОСТ\s*Р?\s*)[\d\.]+(?:-\d{4})?',  # ГОСТ patterns
+        r'НП-\d+(?:-\d+)?',                     # НП patterns
+        r'МУ[_\s][\d\.]+',                      # МУ patterns
+    ]
+
+    found_ids = []
+    for pattern in patterns:
+        matches = re.findall(pattern, query, re.IGNORECASE)
+        found_ids.extend(matches)
+
+    # Normalize spacing
+    normalized = [re.sub(r'\s+', ' ', id.strip()) for id in found_ids]
+    return normalized
+
+
+def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.8):
+    """Keep nodes that match any of the document IDs"""
+    if not doc_ids:
+        return nodes
+
+    from difflib import SequenceMatcher
+
+    filtered = []
+    for node in nodes:
+        node_doc_id = node.metadata.get('document_id', '').upper()
+
+        for query_doc_id in doc_ids:
+            query_doc_id = query_doc_id.upper()
+
+            # Exact substring match
+            if query_doc_id in node_doc_id or node_doc_id in query_doc_id:
+                filtered.append(node)
+                break
+
+            # Fuzzy match for close variants
+            similarity = SequenceMatcher(None, query_doc_id, node_doc_id).ratio()
+            if similarity >= threshold:
+                filtered.append(node)
+                break
+
+    return filtered
+
+
+def russian_tokenizer(text):
+    """Better tokenizer for Russian document IDs and technical terms"""
+    import re
+
+    # Keep document ID patterns intact
+    text = re.sub(r'(ГОСТ\s*Р?\s*[\d\.]+(?:-\d{4})?)', r' \1 ', text)
+    text = re.sub(r'(НП-\d+(?:-\d+)?)', r' \1 ', text)
+    text = re.sub(r'(МУ[_\s][\d\.]+)', r' \1 ', text)
+
+    # Split on whitespace and punctuation, but keep numbers with decimals
+    tokens = re.findall(r'\d+\.\d+|\w+', text.lower())
+
+    return tokens
+
+
 def create_query_engine(vector_index):
-    """Create hybrid retrieval engine with better deduplication"""
+    """Create hybrid retrieval engine with document ID filtering"""
     log_message("Creating query engine...")
 
     vector_retriever = VectorIndexRetriever(
         index=vector_index,
-        similarity_top_k=50  # Reduced to get more diverse results
+        similarity_top_k=50
     )
     bm25_retriever = BM25Retriever.from_defaults(
         docstore=vector_index.docstore,
         similarity_top_k=50,
+        tokenizer=russian_tokenizer  # Add custom tokenizer
+
     )
     hybrid_retriever = QueryFusionRetriever(
         [vector_retriever, bm25_retriever],
-        similarity_top_k=60,  # Reduced
+        similarity_top_k=60,
         num_queries=1
     )
 
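As an illustration (query text invented, not part of the commit), the extraction helper above behaves as follows on a query that cites two standards:

# Assumes the helper is importable from the module above
from index_retriever import extract_doc_id_from_query

query = "Какие требования устанавливают ГОСТ Р 50.04.07-2022 и НП-104?"
# ("What requirements do ГОСТ Р 50.04.07-2022 and НП-104 establish?")
print(extract_doc_id_from_query(query))
# ['ГОСТ Р 50.04.07-2022', 'НП-104']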
@@ -46,20 +111,33 @@ def create_query_engine(vector_index):
         nodes = hybrid_retriever.retrieve(query)
         log_message(f"Hybrid retrieval returned: {len(nodes)} nodes")
 
-        # Better deduplication using longer text snippet
+        # Extract document IDs from query
+        doc_ids = extract_doc_id_from_query(query)
+        if doc_ids:
+            log_message(f"Detected document IDs in query: {doc_ids}")
+
+            # Filter by document ID
+            doc_filtered = filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.7)
+            log_message(f"After doc ID filter: {len(doc_filtered)} nodes")
+
+            # If we found matching documents, use only those
+            if doc_filtered:
+                nodes = doc_filtered
+            else:
+                log_message("WARNING: No nodes matched document IDs, using all results")
+
+        # Deduplication
         seen_hashes = set()
         unique_nodes = []
         doc_type_counts = {'text': 0, 'table': 0, 'image': 0}
 
         for node in nodes:
-            # Use first 500 chars for dedup hash
             text_hash = hash(node.text[:500])
 
             if text_hash not in seen_hashes:
                 seen_hashes.add(text_hash)
                 unique_nodes.append(node)
 
-                # Count by type
                 node_type = node.metadata.get('type', 'text')
                 doc_type_counts[node_type] = doc_type_counts.get(node_type, 0) + 1
 
@@ -68,6 +146,10 @@ def create_query_engine(vector_index):
                     f"table={doc_type_counts.get('table', 0)}, "
                     f"image={doc_type_counts.get('image', 0)}")
 
+        # Log which documents we're returning
+        returned_docs = set(n.metadata.get('document_id', 'unknown') for n in unique_nodes[:50])
+        log_message(f"Returning nodes from: {sorted(returned_docs)}")
+
         return unique_nodes[:50]
 
     response_synthesizer = get_response_synthesizer()
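Two properties of filter_nodes_by_doc_id are worth knowing (illustrative sketch, not part of the commit): SequenceMatcher tolerates close ID variants at the threshold=0.7 passed above, and an empty string is a substring of every query ID, so nodes with no document_id metadata pass the substring check.

from difflib import SequenceMatcher

# A near-variant clears the 0.7 threshold used in create_query_engine
print(SequenceMatcher(None, "ГОСТ 59023.2", "ГОСТ 59023").ratio())  # ~0.91

# Caveat: '' is a substring of any string, so a node without
# document_id metadata matches every query ID; a guard such as
# `if not node_doc_id: continue` (hypothetical) would tighten the filter
print("" in "ГОСТ 59023")  # True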
 
@@ -77,5 +159,5 @@ def create_query_engine(vector_index):
         response_synthesizer=response_synthesizer
     )
 
-    log_message("✓ Query engine created")
+    log_message("✓ Query engine created with doc ID filtering")
     return query_engine
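Finally, note what the custom tokenizer actually produces (illustrative, not part of the commit): the re.sub padding adds whitespace around IDs, but re.findall still splits on punctuation, so a ГОСТ designation is indexed as several word and number tokens rather than one:

# Assumes the tokenizer is importable from the module above
from index_retriever import russian_tokenizer

print(russian_tokenizer("ГОСТ Р 50.04.07-2022"))
# ['гост', 'р', '50.04', '07', '2022']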