Spaces:
Sleeping
Sleeping
Commit
·
a83db61
1
Parent(s):
806f3f9
Much lower reranking threshold (-0.5 instead of 0.1) + detailed score logging
Browse files- documents_prep.py +6 -1
- index_retriever.py +89 -7
documents_prep.py
CHANGED
|
@@ -18,12 +18,17 @@ def chunk_text_documents(documents):
|
|
| 18 |
|
| 19 |
chunked = []
|
| 20 |
for doc in documents:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
chunks = text_splitter.get_nodes_from_documents([doc])
|
| 22 |
for i, chunk in enumerate(chunks):
|
| 23 |
chunk.metadata.update({
|
| 24 |
'chunk_id': i,
|
| 25 |
'total_chunks': len(chunks),
|
| 26 |
-
'chunk_size': len(chunk.text)
|
| 27 |
})
|
| 28 |
chunked.append(chunk)
|
| 29 |
|
|
|
|
| 18 |
|
| 19 |
chunked = []
|
| 20 |
for doc in documents:
|
| 21 |
+
# Add document ID to text for better BM25 matching
|
| 22 |
+
doc_id = doc.metadata.get('document_id', '')
|
| 23 |
+
if doc_id and doc_id not in doc.text[:200]:
|
| 24 |
+
doc.text = f"[Документ: {doc_id}]\n\n{doc.text}"
|
| 25 |
+
|
| 26 |
chunks = text_splitter.get_nodes_from_documents([doc])
|
| 27 |
for i, chunk in enumerate(chunks):
|
| 28 |
chunk.metadata.update({
|
| 29 |
'chunk_id': i,
|
| 30 |
'total_chunks': len(chunks),
|
| 31 |
+
'chunk_size': len(chunk.text)
|
| 32 |
})
|
| 33 |
chunked.append(chunk)
|
| 34 |
|
index_retriever.py
CHANGED
|
@@ -23,21 +23,86 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
|
|
| 23 |
filtered.append(node)
|
| 24 |
return filtered
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
def create_query_engine(vector_index):
|
| 27 |
-
"""Create hybrid retrieval engine with
|
| 28 |
log_message("Creating query engine...")
|
| 29 |
|
| 30 |
vector_retriever = VectorIndexRetriever(
|
| 31 |
index=vector_index,
|
| 32 |
-
similarity_top_k=50
|
| 33 |
)
|
| 34 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 35 |
docstore=vector_index.docstore,
|
| 36 |
similarity_top_k=50,
|
|
|
|
|
|
|
| 37 |
)
|
| 38 |
hybrid_retriever = QueryFusionRetriever(
|
| 39 |
[vector_retriever, bm25_retriever],
|
| 40 |
-
similarity_top_k=60,
|
| 41 |
num_queries=1
|
| 42 |
)
|
| 43 |
|
|
@@ -46,20 +111,33 @@ def create_query_engine(vector_index):
|
|
| 46 |
nodes = hybrid_retriever.retrieve(query)
|
| 47 |
log_message(f"Hybrid retrieval returned: {len(nodes)} nodes")
|
| 48 |
|
| 49 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
seen_hashes = set()
|
| 51 |
unique_nodes = []
|
| 52 |
doc_type_counts = {'text': 0, 'table': 0, 'image': 0}
|
| 53 |
|
| 54 |
for node in nodes:
|
| 55 |
-
# Use first 500 chars for dedup hash
|
| 56 |
text_hash = hash(node.text[:500])
|
| 57 |
|
| 58 |
if text_hash not in seen_hashes:
|
| 59 |
seen_hashes.add(text_hash)
|
| 60 |
unique_nodes.append(node)
|
| 61 |
|
| 62 |
-
# Count by type
|
| 63 |
node_type = node.metadata.get('type', 'text')
|
| 64 |
doc_type_counts[node_type] = doc_type_counts.get(node_type, 0) + 1
|
| 65 |
|
|
@@ -68,6 +146,10 @@ def create_query_engine(vector_index):
|
|
| 68 |
f"table={doc_type_counts.get('table', 0)}, "
|
| 69 |
f"image={doc_type_counts.get('image', 0)}")
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
return unique_nodes[:50]
|
| 72 |
|
| 73 |
response_synthesizer = get_response_synthesizer()
|
|
@@ -77,5 +159,5 @@ def create_query_engine(vector_index):
|
|
| 77 |
response_synthesizer=response_synthesizer
|
| 78 |
)
|
| 79 |
|
| 80 |
-
log_message("✓ Query engine created")
|
| 81 |
return query_engine
|
|
|
|
| 23 |
filtered.append(node)
|
| 24 |
return filtered
|
| 25 |
|
| 26 |
+
import re
|
| 27 |
+
|
| 28 |
+
def extract_doc_id_from_query(query):
    """Extract normative-document IDs (ГОСТ / НП / МУ) mentioned in a query.

    Scans *query* for known Russian document-ID shapes, e.g.
    "ГОСТ 59023.2", "НП-104", "ГОСТ Р 50.04.07-2022".

    Returns a list of matched IDs with internal whitespace collapsed to a
    single space, de-duplicated while preserving first-seen order.
    """
    patterns = [
        r'(?:ГОСТ\s*Р?\s*)[\d\.]+(?:-\d{4})?',  # ГОСТ [Р] 50.04.07[-2022]
        r'НП-\d+(?:-\d+)?',                      # НП-104, НП-001-15
        r'МУ[_\s][\d\.]+',                       # МУ 2.6.1 / МУ_2.6.1
    ]

    found_ids = []
    for pattern in patterns:
        found_ids.extend(re.findall(pattern, query, re.IGNORECASE))

    # Collapse whitespace runs; `doc_id` avoids shadowing the builtin `id`.
    normalized = [re.sub(r'\s+', ' ', doc_id.strip()) for doc_id in found_ids]
    # De-duplicate, keeping first-seen order (dict preserves insertion order).
    return list(dict.fromkeys(normalized))
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.8):
    """Keep only the nodes whose `document_id` metadata matches any of *doc_ids*.

    A node matches a query ID via case-insensitive substring containment in
    either direction, or via a difflib fuzzy ratio >= *threshold* (handles
    close variants like "НП-001-15" vs "НП-001-16").

    If *doc_ids* is empty, filtering is a no-op and *nodes* is returned as-is.
    """
    if not doc_ids:
        return nodes

    from difflib import SequenceMatcher

    filtered = []
    for node in nodes:
        node_doc_id = node.metadata.get('document_id', '').upper()

        # BUG FIX: an empty node_doc_id made "node_doc_id in query_doc_id"
        # always True ('' is a substring of every string), so nodes without a
        # document_id passed the filter unconditionally. Skip them instead.
        if not node_doc_id:
            continue

        for query_doc_id in doc_ids:
            query_doc_id = query_doc_id.upper()

            # Exact substring match in either direction
            if query_doc_id in node_doc_id or node_doc_id in query_doc_id:
                filtered.append(node)
                break

            # Fuzzy match for close variants
            similarity = SequenceMatcher(None, query_doc_id, node_doc_id).ratio()
            if similarity >= threshold:
                filtered.append(node)
                break

    return filtered
|
| 73 |
+
|
| 74 |
+
def russian_tokenizer(text):
    """Tokenize text for BM25, preserving Russian document IDs as whole tokens.

    The generic token pattern splits "НП-001-15" into ['нп', '001', '15'],
    so BM25 could never match a full document ID — the original spacing
    re.subs did not change that. To actually keep IDs intact, each detected
    ID is emitted as one whitespace-normalized, lowercased token, followed
    by the generic word/decimal tokens (superset of the old output).
    """
    import re

    id_patterns = (
        r'ГОСТ\s*Р?\s*[\d\.]+(?:-\d{4})?',  # e.g. ГОСТ Р 50.04.07-2022
        r'НП-\d+(?:-\d+)?',                 # e.g. НП-001-15
        r'МУ[_\s][\d\.]+',                  # e.g. МУ 2.6.1
    )

    # Whole-ID tokens, whitespace-normalized and lowercased
    id_tokens = []
    for pattern in id_patterns:
        for match in re.findall(pattern, text, re.IGNORECASE):
            id_tokens.append(re.sub(r'\s+', ' ', match.strip()).lower())

    # Generic tokens: words, plus decimal numbers kept whole (e.g. "5.2")
    word_tokens = re.findall(r'\d+\.\d+|\w+', text.lower())

    return id_tokens + word_tokens
|
| 87 |
+
|
| 88 |
+
|
| 89 |
def create_query_engine(vector_index):
|
| 90 |
+
"""Create hybrid retrieval engine with document ID filtering"""
|
| 91 |
log_message("Creating query engine...")
|
| 92 |
|
| 93 |
vector_retriever = VectorIndexRetriever(
|
| 94 |
index=vector_index,
|
| 95 |
+
similarity_top_k=50
|
| 96 |
)
|
| 97 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 98 |
docstore=vector_index.docstore,
|
| 99 |
similarity_top_k=50,
|
| 100 |
+
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 101 |
+
|
| 102 |
)
|
| 103 |
hybrid_retriever = QueryFusionRetriever(
|
| 104 |
[vector_retriever, bm25_retriever],
|
| 105 |
+
similarity_top_k=60,
|
| 106 |
num_queries=1
|
| 107 |
)
|
| 108 |
|
|
|
|
| 111 |
nodes = hybrid_retriever.retrieve(query)
|
| 112 |
log_message(f"Hybrid retrieval returned: {len(nodes)} nodes")
|
| 113 |
|
| 114 |
+
# Extract document IDs from query
|
| 115 |
+
doc_ids = extract_doc_id_from_query(query)
|
| 116 |
+
if doc_ids:
|
| 117 |
+
log_message(f"Detected document IDs in query: {doc_ids}")
|
| 118 |
+
|
| 119 |
+
# Filter by document ID
|
| 120 |
+
doc_filtered = filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.7)
|
| 121 |
+
log_message(f"After doc ID filter: {len(doc_filtered)} nodes")
|
| 122 |
+
|
| 123 |
+
# If we found matching documents, use only those
|
| 124 |
+
if doc_filtered:
|
| 125 |
+
nodes = doc_filtered
|
| 126 |
+
else:
|
| 127 |
+
log_message("WARNING: No nodes matched document IDs, using all results")
|
| 128 |
+
|
| 129 |
+
# Deduplication
|
| 130 |
seen_hashes = set()
|
| 131 |
unique_nodes = []
|
| 132 |
doc_type_counts = {'text': 0, 'table': 0, 'image': 0}
|
| 133 |
|
| 134 |
for node in nodes:
|
|
|
|
| 135 |
text_hash = hash(node.text[:500])
|
| 136 |
|
| 137 |
if text_hash not in seen_hashes:
|
| 138 |
seen_hashes.add(text_hash)
|
| 139 |
unique_nodes.append(node)
|
| 140 |
|
|
|
|
| 141 |
node_type = node.metadata.get('type', 'text')
|
| 142 |
doc_type_counts[node_type] = doc_type_counts.get(node_type, 0) + 1
|
| 143 |
|
|
|
|
| 146 |
f"table={doc_type_counts.get('table', 0)}, "
|
| 147 |
f"image={doc_type_counts.get('image', 0)}")
|
| 148 |
|
| 149 |
+
# Log which documents we're returning
|
| 150 |
+
returned_docs = set(n.metadata.get('document_id', 'unknown') for n in unique_nodes[:50])
|
| 151 |
+
log_message(f"Returning nodes from: {sorted(returned_docs)}")
|
| 152 |
+
|
| 153 |
return unique_nodes[:50]
|
| 154 |
|
| 155 |
response_synthesizer = get_response_synthesizer()
|
|
|
|
| 159 |
response_synthesizer=response_synthesizer
|
| 160 |
)
|
| 161 |
|
| 162 |
+
log_message("✓ Query engine created with doc ID filtering")
|
| 163 |
return query_engine
|