MrSimple07 committed on
Commit
ad8e8ec
·
1 Parent(s): c33deff

removed normalization doc id

Browse files
Files changed (3) hide show
  1. documents_prep.py +2 -2
  2. index_retriever.py +13 -19
  3. utils.py +2 -2
documents_prep.py CHANGED
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=2000):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
@@ -222,7 +222,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
222
  for sheet in data.get('sheets', []):
223
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
224
 
225
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2000)
226
  all_chunks.extend(chunks)
227
 
228
  except Exception as e:
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=1024):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
 
222
  for sheet in data.get('sheets', []):
223
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
224
 
225
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1024)
226
  all_chunks.extend(chunks)
227
 
228
  except Exception as e:
index_retriever.py CHANGED
@@ -6,21 +6,6 @@ from llama_index.core.retrievers import QueryFusionRetriever
6
  from llama_index.core.response_synthesizers import get_response_synthesizer
7
  from my_logging import log_message
8
 
9
- SIMPLE_PROMPT = """Вы - эксперт по нормативной документации.
10
-
11
- Контекст:
12
- {context_str}
13
-
14
- Вопрос: {query_str}
15
-
16
- Инструкция:
17
- 1. Отвечайте ТОЛЬКО на основе предоставленного контекста
18
- 2. Цитируйте конкретные источники (документ, раздел, таблицу)
19
- 3. Если информации недостаточно, четко укажите это
20
- 4. Будьте точны и конкретны
21
-
22
- Ответ:"""
23
-
24
  def create_vector_index(documents):
25
  """Create vector index from documents"""
26
  log_message(f"Building vector index from {len(documents)} documents...")
@@ -44,15 +29,15 @@ def create_query_engine(vector_index):
44
 
45
  vector_retriever = VectorIndexRetriever(
46
  index=vector_index,
47
- similarity_top_k=80 # Reduced from 50
48
  )
49
  bm25_retriever = BM25Retriever.from_defaults(
50
  docstore=vector_index.docstore,
51
- similarity_top_k=80 # Reduced from 50
52
  )
53
  hybrid_retriever = QueryFusionRetriever(
54
  [vector_retriever, bm25_retriever],
55
- similarity_top_k=100, # Reduced from 60
56
  num_queries=1
57
  )
58
 
@@ -73,11 +58,20 @@ def create_query_engine(vector_index):
73
 
74
  log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
75
  return unique_nodes[:50] # Return top 50 unique
 
 
 
 
 
 
 
 
 
76
 
77
  response_synthesizer = get_response_synthesizer()
78
 
79
  query_engine = DeduplicatedQueryEngine(
80
- retriever=hybrid_retriever,
81
  response_synthesizer=response_synthesizer
82
  )
83
 
 
6
  from llama_index.core.response_synthesizers import get_response_synthesizer
7
  from my_logging import log_message
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def create_vector_index(documents):
10
  """Create vector index from documents"""
11
  log_message(f"Building vector index from {len(documents)} documents...")
 
29
 
30
  vector_retriever = VectorIndexRetriever(
31
  index=vector_index,
32
+ similarity_top_k=80
33
  )
34
  bm25_retriever = BM25Retriever.from_defaults(
35
  docstore=vector_index.docstore,
36
+ similarity_top_k=80,
37
  )
38
  hybrid_retriever = QueryFusionRetriever(
39
  [vector_retriever, bm25_retriever],
40
+ similarity_top_k=100,
41
  num_queries=1
42
  )
43
 
 
58
 
59
  log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
60
  return unique_nodes[:50] # Return top 50 unique
61
+
62
+ # FIX: override query() so the response is synthesized from the deduplicated nodes returned by our retrieve()
63
+ def query(self, query_bundle):
64
+ nodes = self.retrieve(query_bundle.query_str)
65
+ response = self._response_synthesizer.synthesize(
66
+ query=query_bundle,
67
+ nodes=nodes
68
+ )
69
+ return response
70
 
71
  response_synthesizer = get_response_synthesizer()
72
 
73
  query_engine = DeduplicatedQueryEngine(
74
+ retriever=hybrid_retriever, # Still pass it but we override retrieve()
75
  response_synthesizer=response_synthesizer
76
  )
77
 
utils.py CHANGED
@@ -47,7 +47,7 @@ def answer_question(question, query_engine, reranker):
47
  retrieved = query_engine.retrieve(question)
48
  log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
49
 
50
- reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=0.3)
51
  log_message(f"RERANKED: {len(reranked)} nodes")
52
 
53
  context_parts = []
@@ -83,7 +83,7 @@ def answer_question(question, query_engine, reranker):
83
  log_message(traceback.format_exc())
84
  return f"Ошибка: {e}", ""
85
 
86
- def rerank_nodes(query, nodes, reranker, top_k=20, min_score=0.3):
87
  """Simple and effective reranking: sort by score and filter by threshold."""
88
  if not nodes or not reranker:
89
  return nodes[:top_k]
 
47
  retrieved = query_engine.retrieve(question)
48
  log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
49
 
50
+ reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.1)
51
  log_message(f"RERANKED: {len(reranked)} nodes")
52
 
53
  context_parts = []
 
83
  log_message(traceback.format_exc())
84
  return f"Ошибка: {e}", ""
85
 
86
+ def rerank_nodes(query, nodes, reranker, top_k=20, min_score=0.1):
87
  """Simple and effective reranking: sort by score and filter by threshold."""
88
  if not nodes or not reranker:
89
  return nodes[:top_k]