Spaces:
Sleeping
Sleeping
Commit
·
8114c87
1
Parent(s):
ae5a669
index retriever = 100 + 100
Browse files- index_retriever.py +4 -4
- utils.py +19 -13
index_retriever.py
CHANGED
|
@@ -44,15 +44,15 @@ def create_query_engine(vector_index):
|
|
| 44 |
|
| 45 |
vector_retriever = VectorIndexRetriever(
|
| 46 |
index=vector_index,
|
| 47 |
-
similarity_top_k=
|
| 48 |
)
|
| 49 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 50 |
docstore=vector_index.docstore,
|
| 51 |
-
similarity_top_k=
|
| 52 |
)
|
| 53 |
hybrid_retriever = QueryFusionRetriever(
|
| 54 |
[vector_retriever, bm25_retriever],
|
| 55 |
-
similarity_top_k=
|
| 56 |
num_queries=1
|
| 57 |
)
|
| 58 |
|
|
@@ -72,7 +72,7 @@ def create_query_engine(vector_index):
|
|
| 72 |
unique_nodes.append(node)
|
| 73 |
|
| 74 |
log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
|
| 75 |
-
return unique_nodes[:
|
| 76 |
|
| 77 |
response_synthesizer = get_response_synthesizer()
|
| 78 |
|
|
|
|
| 44 |
|
| 45 |
vector_retriever = VectorIndexRetriever(
|
| 46 |
index=vector_index,
|
| 47 |
+
similarity_top_k=80 # Changed from 50
|
| 48 |
)
|
| 49 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 50 |
docstore=vector_index.docstore,
|
| 51 |
+
similarity_top_k=80 # Changed from 50
|
| 52 |
)
|
| 53 |
hybrid_retriever = QueryFusionRetriever(
|
| 54 |
[vector_retriever, bm25_retriever],
|
| 55 |
+
similarity_top_k=100, # Changed from 60
|
| 56 |
num_queries=1
|
| 57 |
)
|
| 58 |
|
|
|
|
| 72 |
unique_nodes.append(node)
|
| 73 |
|
| 74 |
log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
|
| 75 |
+
return unique_nodes[:60] # Return top 60 unique
|
| 76 |
|
| 77 |
response_synthesizer = get_response_synthesizer()
|
| 78 |
|
utils.py
CHANGED
|
@@ -41,11 +41,10 @@ import re
|
|
| 41 |
|
| 42 |
def extract_document_id(query):
|
| 43 |
"""Extract explicit document IDs from query"""
|
| 44 |
-
# Patterns for common document formats
|
| 45 |
patterns = [
|
| 46 |
-
r'ГОСТ\s*Р?\s*[\d.-]+', # ГОСТ 59023.4
|
| 47 |
-
r'НП-\d+-\d+',
|
| 48 |
-
r'МУ[_\s][\d.]+',
|
| 49 |
]
|
| 50 |
|
| 51 |
for pattern in patterns:
|
|
@@ -54,43 +53,51 @@ def extract_document_id(query):
|
|
| 54 |
return match.group(0).strip()
|
| 55 |
return None
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def answer_question(question, query_engine, reranker):
|
| 58 |
try:
|
| 59 |
log_message(f"\n{'='*70}")
|
| 60 |
log_message(f"QUERY: {question}")
|
| 61 |
|
| 62 |
-
# Check for explicit document reference
|
| 63 |
target_doc_id = extract_document_id(question)
|
| 64 |
if target_doc_id:
|
| 65 |
log_message(f"TARGET DOCUMENT: {target_doc_id}")
|
| 66 |
|
| 67 |
-
# Retrieve nodes
|
| 68 |
retrieved = query_engine.retrieve(question)
|
| 69 |
log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
|
| 70 |
|
| 71 |
-
# Filter by document if explicitly mentioned
|
| 72 |
if target_doc_id:
|
| 73 |
-
|
| 74 |
-
|
| 75 |
|
| 76 |
filtered = [
|
| 77 |
node for node in retrieved
|
| 78 |
-
if target_normalized in node.metadata.get('document_id', '')
|
| 79 |
]
|
| 80 |
|
| 81 |
log_message(f"FILTERED TO TARGET DOC: {len(filtered)} nodes")
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
if not filtered:
|
| 84 |
log_message(f"WARNING: No nodes found for {target_doc_id}")
|
| 85 |
return f"В базе данных не найдены таблицы из документа {target_doc_id}.", ""
|
| 86 |
|
| 87 |
retrieved = filtered
|
| 88 |
|
| 89 |
-
#
|
| 90 |
reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.25)
|
| 91 |
log_message(f"RERANKED: {len(reranked)} nodes")
|
| 92 |
|
| 93 |
-
# Rest of your existing code...
|
| 94 |
context_parts = []
|
| 95 |
for n in reranked:
|
| 96 |
meta = n.metadata
|
|
@@ -119,7 +126,6 @@ def answer_question(question, query_engine, reranker):
|
|
| 119 |
|
| 120 |
sources = format_sources(reranked)
|
| 121 |
|
| 122 |
-
# Log retrieved chunks
|
| 123 |
log_message(f"\n{'='*70}")
|
| 124 |
log_message("RETRIEVED CHUNKS:")
|
| 125 |
for i, node in enumerate(reranked, 1):
|
|
|
|
| 41 |
|
| 42 |
def extract_document_id(query):
|
| 43 |
"""Extract explicit document IDs from query"""
|
|
|
|
| 44 |
patterns = [
|
| 45 |
+
r'ГОСТ\s*Р?\s*[\d.-]+(?:-\d{4})?', # ГОСТ 59023.4 or ГОСТ Р 59023.5-2020
|
| 46 |
+
r'НП-\d+-\d+',
|
| 47 |
+
r'МУ[_\s][\d.]+',
|
| 48 |
]
|
| 49 |
|
| 50 |
for pattern in patterns:
|
|
|
|
| 53 |
return match.group(0).strip()
|
| 54 |
return None
|
| 55 |
|
| 56 |
+
def normalize_doc_id(doc_id):
    """Normalize a document identifier for flexible matching.

    Variants of the same ID (different spacing, presence of the Cyrillic
    'Р' register marker, letter case, or a trailing publication year)
    normalize to the same string.
    """
    # Strip spaces and the Cyrillic 'Р'/'р' marker, then lowercase.
    compact = doc_id.replace(' ', '')
    for marker in ('Р', 'р'):
        compact = compact.replace(marker, '')
    compact = compact.lower()
    # Drop a trailing 4-digit year suffix such as "-2020" before comparing.
    return re.sub(r'-\d{4}$', '', compact)
|
| 62 |
+
|
| 63 |
def answer_question(question, query_engine, reranker):
|
| 64 |
try:
|
| 65 |
log_message(f"\n{'='*70}")
|
| 66 |
log_message(f"QUERY: {question}")
|
| 67 |
|
|
|
|
| 68 |
target_doc_id = extract_document_id(question)
|
| 69 |
if target_doc_id:
|
| 70 |
log_message(f"TARGET DOCUMENT: {target_doc_id}")
|
| 71 |
|
|
|
|
| 72 |
retrieved = query_engine.retrieve(question)
|
| 73 |
log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
|
| 74 |
|
|
|
|
| 75 |
if target_doc_id:
|
| 76 |
+
target_normalized = normalize_doc_id(target_doc_id)
|
| 77 |
+
log_message(f"NORMALIZED TARGET: {target_normalized}")
|
| 78 |
|
| 79 |
filtered = [
|
| 80 |
node for node in retrieved
|
| 81 |
+
if target_normalized in normalize_doc_id(node.metadata.get('document_id', ''))
|
| 82 |
]
|
| 83 |
|
| 84 |
log_message(f"FILTERED TO TARGET DOC: {len(filtered)} nodes")
|
| 85 |
|
| 86 |
+
# Debug: show what document IDs were found
|
| 87 |
+
if not filtered and len(retrieved) > 0:
|
| 88 |
+
found_docs = set(node.metadata.get('document_id', 'unknown') for node in retrieved[:10])
|
| 89 |
+
log_message(f"AVAILABLE DOCS (sample): {', '.join(list(found_docs)[:5])}")
|
| 90 |
+
|
| 91 |
if not filtered:
|
| 92 |
log_message(f"WARNING: No nodes found for {target_doc_id}")
|
| 93 |
return f"В базе данных не найдены таблицы из документа {target_doc_id}.", ""
|
| 94 |
|
| 95 |
retrieved = filtered
|
| 96 |
|
| 97 |
+
# Rest stays the same...
|
| 98 |
reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.25)
|
| 99 |
log_message(f"RERANKED: {len(reranked)} nodes")
|
| 100 |
|
|
|
|
| 101 |
context_parts = []
|
| 102 |
for n in reranked:
|
| 103 |
meta = n.metadata
|
|
|
|
| 126 |
|
| 127 |
sources = format_sources(reranked)
|
| 128 |
|
|
|
|
| 129 |
log_message(f"\n{'='*70}")
|
| 130 |
log_message("RETRIEVED CHUNKS:")
|
| 131 |
for i, node in enumerate(reranked, 1):
|