MrSimple07 committed on
Commit
ae5a669
·
1 Parent(s): dfc7ba2

max_chars = 1500 + doc id retriever

Browse files
Files changed (3) hide show
  1. documents_prep.py +2 -2
  2. index_retriever.py +3 -3
  3. utils.py +44 -5
documents_prep.py CHANGED
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=1200):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
@@ -222,7 +222,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
222
  for sheet in data.get('sheets', []):
223
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
224
 
225
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
226
  all_chunks.extend(chunks)
227
 
228
  except Exception as e:
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=1500):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
 
222
  for sheet in data.get('sheets', []):
223
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
224
 
225
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1500)
226
  all_chunks.extend(chunks)
227
 
228
  except Exception as e:
index_retriever.py CHANGED
@@ -44,15 +44,15 @@ def create_query_engine(vector_index):
44
 
45
  vector_retriever = VectorIndexRetriever(
46
  index=vector_index,
47
- similarity_top_k=50 # Reduced from 50
48
  )
49
  bm25_retriever = BM25Retriever.from_defaults(
50
  docstore=vector_index.docstore,
51
- similarity_top_k=50 # Reduced from 50
52
  )
53
  hybrid_retriever = QueryFusionRetriever(
54
  [vector_retriever, bm25_retriever],
55
- similarity_top_k=60, # Reduced from 60
56
  num_queries=1
57
  )
58
 
 
44
 
45
  vector_retriever = VectorIndexRetriever(
46
  index=vector_index,
47
+ similarity_top_k=60 # Increased from 50
48
  )
49
  bm25_retriever = BM25Retriever.from_defaults(
50
  docstore=vector_index.docstore,
51
+ similarity_top_k=60 # Increased from 50
52
  )
53
  hybrid_retriever = QueryFusionRetriever(
54
  [vector_retriever, bm25_retriever],
55
+ similarity_top_k=80, # Increased from 60
56
  num_queries=1
57
  )
58
 
utils.py CHANGED
@@ -37,20 +37,60 @@ def format_sources(nodes):
37
 
38
  return "\n".join(set(sources))
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def answer_question(question, query_engine, reranker):
41
  try:
42
  log_message(f"\n{'='*70}")
43
  log_message(f"QUERY: {question}")
44
 
45
- # Retrieve nodes (already deduplicated)
 
 
 
 
 
46
  retrieved = query_engine.retrieve(question)
47
  log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # Rerank
50
  reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.25)
51
  log_message(f"RERANKED: {len(reranked)} nodes")
52
 
53
- # Build context - NO TRUNCATION
54
  context_parts = []
55
  for n in reranked:
56
  meta = n.metadata
@@ -66,7 +106,7 @@ def answer_question(question, query_engine, reranker):
66
  else:
67
  source_label = f"[{doc_id}]"
68
 
69
- context_parts.append(f"{source_label}\n{n.text}") # Full text
70
 
71
  context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
72
 
@@ -79,7 +119,7 @@ def answer_question(question, query_engine, reranker):
79
 
80
  sources = format_sources(reranked)
81
 
82
- # Log retrieved chunks WITHOUT duplicates
83
  log_message(f"\n{'='*70}")
84
  log_message("RETRIEVED CHUNKS:")
85
  for i, node in enumerate(reranked, 1):
@@ -99,7 +139,6 @@ def answer_question(question, query_engine, reranker):
99
  import traceback
100
  log_message(traceback.format_exc())
101
  return f"Ошибка: {e}", ""
102
-
103
 
104
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score=0.3):
105
  """Simple and effective reranking: sort by score and filter by threshold."""
 
37
 
38
  return "\n".join(set(sources))
39
 
40
+ import re
41
+
42
+ def extract_document_id(query):
43
+ """Extract explicit document IDs from query"""
44
+ # Patterns for common document formats
45
+ patterns = [
46
+ r'ГОСТ\s*Р?\s*[\d.-]+', # ГОСТ 59023.4, ГОСТ Р 59023.5-2020
47
+ r'НП-\d+-\d+', # НП-105-18
48
+ r'МУ[_\s][\d.]+', # МУ 1.1.4.01.1422-2019
49
+ ]
50
+
51
+ for pattern in patterns:
52
+ match = re.search(pattern, query, re.IGNORECASE)
53
+ if match:
54
+ return match.group(0).strip()
55
+ return None
56
+
57
  def answer_question(question, query_engine, reranker):
58
  try:
59
  log_message(f"\n{'='*70}")
60
  log_message(f"QUERY: {question}")
61
 
62
+ # Check for explicit document reference
63
+ target_doc_id = extract_document_id(question)
64
+ if target_doc_id:
65
+ log_message(f"TARGET DOCUMENT: {target_doc_id}")
66
+
67
+ # Retrieve nodes
68
  retrieved = query_engine.retrieve(question)
69
  log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
70
 
71
+ # Filter by document if explicitly mentioned
72
+ if target_doc_id:
73
+ # Normalize for comparison (remove spaces, case-insensitive)
74
+ target_normalized = target_doc_id.replace(' ', '').lower()
75
+
76
+ filtered = [
77
+ node for node in retrieved
78
+ if target_normalized in node.metadata.get('document_id', '').replace(' ', '').lower()
79
+ ]
80
+
81
+ log_message(f"FILTERED TO TARGET DOC: {len(filtered)} nodes")
82
+
83
+ if not filtered:
84
+ log_message(f"WARNING: No nodes found for {target_doc_id}")
85
+ return f"В базе данных не найдены таблицы из документа {target_doc_id}.", ""
86
+
87
+ retrieved = filtered
88
+
89
  # Rerank
90
  reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.25)
91
  log_message(f"RERANKED: {len(reranked)} nodes")
92
 
93
+ # Build context from the reranked nodes
94
  context_parts = []
95
  for n in reranked:
96
  meta = n.metadata
 
106
  else:
107
  source_label = f"[{doc_id}]"
108
 
109
+ context_parts.append(f"{source_label}\n{n.text}")
110
 
111
  context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
112
 
 
119
 
120
  sources = format_sources(reranked)
121
 
122
+ # Log retrieved chunks
123
  log_message(f"\n{'='*70}")
124
  log_message("RETRIEVED CHUNKS:")
125
  for i, node in enumerate(reranked, 1):
 
139
  import traceback
140
  log_message(traceback.format_exc())
141
  return f"Ошибка: {e}", ""
 
142
 
143
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score=0.3):
144
  """Simple and effective reranking: sort by score and filter by threshold."""