MrSimple07 commited on
Commit
8114c87
·
1 Parent(s): ae5a669

index retriever = 100 + 100

Browse files
Files changed (2) hide show
  1. index_retriever.py +4 -4
  2. utils.py +19 -13
index_retriever.py CHANGED
@@ -44,15 +44,15 @@ def create_query_engine(vector_index):
44
 
45
  vector_retriever = VectorIndexRetriever(
46
  index=vector_index,
47
- similarity_top_k=60 # Reduced from 50
48
  )
49
  bm25_retriever = BM25Retriever.from_defaults(
50
  docstore=vector_index.docstore,
51
- similarity_top_k=60 # Reduced from 50
52
  )
53
  hybrid_retriever = QueryFusionRetriever(
54
  [vector_retriever, bm25_retriever],
55
- similarity_top_k=80, # Reduced from 60
56
  num_queries=1
57
  )
58
 
@@ -72,7 +72,7 @@ def create_query_engine(vector_index):
72
  unique_nodes.append(node)
73
 
74
  log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
75
- return unique_nodes[:50] # Return top 50 unique
76
 
77
  response_synthesizer = get_response_synthesizer()
78
 
 
44
 
45
  vector_retriever = VectorIndexRetriever(
46
  index=vector_index,
47
+ similarity_top_k=80 # Increased from 60
48
  )
49
  bm25_retriever = BM25Retriever.from_defaults(
50
  docstore=vector_index.docstore,
51
+ similarity_top_k=80 # Increased from 60
52
  )
53
  hybrid_retriever = QueryFusionRetriever(
54
  [vector_retriever, bm25_retriever],
55
+ similarity_top_k=100, # Increased from 80
56
  num_queries=1
57
  )
58
 
 
72
  unique_nodes.append(node)
73
 
74
  log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
75
+ return unique_nodes[:60] # Return top 60 unique
76
 
77
  response_synthesizer = get_response_synthesizer()
78
 
utils.py CHANGED
@@ -41,11 +41,10 @@ import re
41
 
42
  def extract_document_id(query):
43
  """Extract explicit document IDs from query"""
44
- # Patterns for common document formats
45
  patterns = [
46
- r'ГОСТ\s*Р?\s*[\d.-]+', # ГОСТ 59023.4, ГОСТ Р 59023.5-2020
47
- r'НП-\d+-\d+', # НП-105-18
48
- r'МУ[_\s][\d.]+', # МУ 1.1.4.01.1422-2019
49
  ]
50
 
51
  for pattern in patterns:
@@ -54,43 +53,51 @@ def extract_document_id(query):
54
  return match.group(0).strip()
55
  return None
56
 
 
 
 
 
 
 
 
57
  def answer_question(question, query_engine, reranker):
58
  try:
59
  log_message(f"\n{'='*70}")
60
  log_message(f"QUERY: {question}")
61
 
62
- # Check for explicit document reference
63
  target_doc_id = extract_document_id(question)
64
  if target_doc_id:
65
  log_message(f"TARGET DOCUMENT: {target_doc_id}")
66
 
67
- # Retrieve nodes
68
  retrieved = query_engine.retrieve(question)
69
  log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
70
 
71
- # Filter by document if explicitly mentioned
72
  if target_doc_id:
73
- # Normalize for comparison (remove spaces, case-insensitive)
74
- target_normalized = target_doc_id.replace(' ', '').lower()
75
 
76
  filtered = [
77
  node for node in retrieved
78
- if target_normalized in node.metadata.get('document_id', '').replace(' ', '').lower()
79
  ]
80
 
81
  log_message(f"FILTERED TO TARGET DOC: {len(filtered)} nodes")
82
 
 
 
 
 
 
83
  if not filtered:
84
  log_message(f"WARNING: No nodes found for {target_doc_id}")
85
  return f"В базе данных не найдены таблицы из документа {target_doc_id}.", ""
86
 
87
  retrieved = filtered
88
 
89
- # Rerank
90
  reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.25)
91
  log_message(f"RERANKED: {len(reranked)} nodes")
92
 
93
- # Rest of your existing code...
94
  context_parts = []
95
  for n in reranked:
96
  meta = n.metadata
@@ -119,7 +126,6 @@ def answer_question(question, query_engine, reranker):
119
 
120
  sources = format_sources(reranked)
121
 
122
- # Log retrieved chunks
123
  log_message(f"\n{'='*70}")
124
  log_message("RETRIEVED CHUNKS:")
125
  for i, node in enumerate(reranked, 1):
 
41
 
42
  def extract_document_id(query):
43
  """Extract explicit document IDs from query"""
 
44
  patterns = [
45
+ r'ГОСТ\s*Р?\s*[\d.-]+(?:-\d{4})?', # ГОСТ 59023.4 or ГОСТ Р 59023.5-2020
46
+ r'НП-\d+-\d+',
47
+ r'МУ[_\s][\d.]+',
48
  ]
49
 
50
  for pattern in patterns:
 
53
  return match.group(0).strip()
54
  return None
55
 
56
+ def normalize_doc_id(doc_id):
57
+ """Normalize document ID for flexible matching"""
58
+ normalized = doc_id.replace(' ', '').replace('Р', '').replace('р', '').lower()
59
+ # Remove year suffix for comparison (e.g., -2020)
60
+ normalized = re.sub(r'-\d{4}$', '', normalized)
61
+ return normalized
62
+
63
  def answer_question(question, query_engine, reranker):
64
  try:
65
  log_message(f"\n{'='*70}")
66
  log_message(f"QUERY: {question}")
67
 
 
68
  target_doc_id = extract_document_id(question)
69
  if target_doc_id:
70
  log_message(f"TARGET DOCUMENT: {target_doc_id}")
71
 
 
72
  retrieved = query_engine.retrieve(question)
73
  log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
74
 
 
75
  if target_doc_id:
76
+ target_normalized = normalize_doc_id(target_doc_id)
77
+ log_message(f"NORMALIZED TARGET: {target_normalized}")
78
 
79
  filtered = [
80
  node for node in retrieved
81
+ if target_normalized in normalize_doc_id(node.metadata.get('document_id', ''))
82
  ]
83
 
84
  log_message(f"FILTERED TO TARGET DOC: {len(filtered)} nodes")
85
 
86
+ # Debug: show what document IDs were found
87
+ if not filtered and len(retrieved) > 0:
88
+ found_docs = set(node.metadata.get('document_id', 'unknown') for node in retrieved[:10])
89
+ log_message(f"AVAILABLE DOCS (sample): {', '.join(list(found_docs)[:5])}")
90
+
91
  if not filtered:
92
  log_message(f"WARNING: No nodes found for {target_doc_id}")
93
  return f"В базе данных не найдены таблицы из документа {target_doc_id}.", ""
94
 
95
  retrieved = filtered
96
 
97
+ # Rest stays the same...
98
  reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.25)
99
  log_message(f"RERANKED: {len(reranked)} nodes")
100
 
 
101
  context_parts = []
102
  for n in reranked:
103
  meta = n.metadata
 
126
 
127
  sources = format_sources(reranked)
128
 
 
129
  log_message(f"\n{'='*70}")
130
  log_message("RETRIEVED CHUNKS:")
131
  for i, node in enumerate(reranked, 1):