MrSimple07 committed on
Commit
31659d7
·
1 Parent(s): 8114c87

index retriever = 100 + 100

Browse files
Files changed (3) hide show
  1. documents_prep.py +2 -2
  2. index_retriever.py +1 -1
  3. utils.py +6 -7
documents_prep.py CHANGED
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=1500):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
@@ -222,7 +222,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
222
  for sheet in data.get('sheets', []):
223
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
224
 
225
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1500)
226
  all_chunks.extend(chunks)
227
 
228
  except Exception as e:
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=2000):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
 
222
  for sheet in data.get('sheets', []):
223
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
224
 
225
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2000)
226
  all_chunks.extend(chunks)
227
 
228
  except Exception as e:
index_retriever.py CHANGED
@@ -72,7 +72,7 @@ def create_query_engine(vector_index):
72
  unique_nodes.append(node)
73
 
74
  log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
75
- return unique_nodes[:60] # Return top 50 unique
76
 
77
  response_synthesizer = get_response_synthesizer()
78
 
 
72
  unique_nodes.append(node)
73
 
74
  log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
75
+ return unique_nodes[:50] # Return top 50 unique
76
 
77
  response_synthesizer = get_response_synthesizer()
78
 
utils.py CHANGED
@@ -54,10 +54,9 @@ def extract_document_id(query):
54
  return None
55
 
56
  def normalize_doc_id(doc_id):
57
- """Normalize document ID for flexible matching"""
58
- normalized = doc_id.replace(' ', '').replace('Р', '').replace('р', '').lower()
59
- # Remove year suffix for comparison (e.g., -2020)
60
  normalized = re.sub(r'-\d{4}$', '', normalized)
 
61
  return normalized
62
 
63
  def answer_question(question, query_engine, reranker):
@@ -66,6 +65,8 @@ def answer_question(question, query_engine, reranker):
66
  log_message(f"QUERY: {question}")
67
 
68
  target_doc_id = extract_document_id(question)
 
 
69
  if target_doc_id:
70
  log_message(f"TARGET DOCUMENT: {target_doc_id}")
71
 
@@ -74,8 +75,6 @@ def answer_question(question, query_engine, reranker):
74
 
75
  if target_doc_id:
76
  target_normalized = normalize_doc_id(target_doc_id)
77
- log_message(f"NORMALIZED TARGET: {target_normalized}")
78
-
79
  filtered = [
80
  node for node in retrieved
81
  if target_normalized in normalize_doc_id(node.metadata.get('document_id', ''))
@@ -95,7 +94,7 @@ def answer_question(question, query_engine, reranker):
95
  retrieved = filtered
96
 
97
  # Rest stays the same...
98
- reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.25)
99
  log_message(f"RERANKED: {len(reranked)} nodes")
100
 
101
  context_parts = []
@@ -146,7 +145,7 @@ def answer_question(question, query_engine, reranker):
146
  log_message(traceback.format_exc())
147
  return f"Ошибка: {e}", ""
148
 
149
- def rerank_nodes(query, nodes, reranker, top_k=25, min_score=0.3):
150
  """Simple and effective reranking: sort by score and filter by threshold."""
151
  if not nodes or not reranker:
152
  return nodes[:top_k]
 
54
  return None
55
 
56
  def normalize_doc_id(doc_id):
57
+ normalized = doc_id.replace(' ', '').replace('р', '').replace('Р', '').lower()
 
 
58
  normalized = re.sub(r'-\d{4}$', '', normalized)
59
+ normalized = normalized.replace('.', '') # Remove dots for flexible matching
60
  return normalized
61
 
62
  def answer_question(question, query_engine, reranker):
 
65
  log_message(f"QUERY: {question}")
66
 
67
  target_doc_id = extract_document_id(question)
68
+ found_docs = set(normalize_doc_id(node.metadata.get('document_id', 'unknown')) for node in query_engine.retrieve(question))
69
+ log_message(f"NORMALIZED DOCS IN RETRIEVED: {', '.join(list(found_docs))}")
70
  if target_doc_id:
71
  log_message(f"TARGET DOCUMENT: {target_doc_id}")
72
 
 
75
 
76
  if target_doc_id:
77
  target_normalized = normalize_doc_id(target_doc_id)
 
 
78
  filtered = [
79
  node for node in retrieved
80
  if target_normalized in normalize_doc_id(node.metadata.get('document_id', ''))
 
94
  retrieved = filtered
95
 
96
  # Rest stays the same...
97
+ reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=0.25)
98
  log_message(f"RERANKED: {len(reranked)} nodes")
99
 
100
  context_parts = []
 
145
  log_message(traceback.format_exc())
146
  return f"Ошибка: {e}", ""
147
 
148
+ def rerank_nodes(query, nodes, reranker, top_k=20, min_score=0.3):
149
  """Simple and effective reranking: sort by score and filter by threshold."""
150
  if not nodes or not reranker:
151
  return nodes[:top_k]