MrSimple07 committed
Commit a42e1ff · 1 Parent(s): 40de98c

old state with utils

Files changed (3):
  1. index_retriever.py +15 -50
  2. table_prep.py +43 -119
  3. utils.py +7 -31
index_retriever.py CHANGED
@@ -12,7 +12,7 @@ def create_vector_index(documents):
     log_message("Строю векторный индекс")
     return VectorStoreIndex.from_documents(documents)
 
-def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
+def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5):
     if not nodes or not reranker:
         return nodes[:top_k]
 
@@ -25,53 +25,16 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
 
         scored_nodes.sort(key=lambda x: x[1], reverse=True)
 
-        if min_score_threshold is not None:
-            scored_nodes = [(node, score) for node, score in scored_nodes
-                            if score >= min_score_threshold]
-            log_message(f"После фильтрации по порогу {min_score_threshold}: {len(scored_nodes)} узлов")
-
-        if not scored_nodes:
-            log_message("Нет узлов после фильтрации, снижаю порог")
-            scored_nodes = list(zip(nodes, scores))
-            scored_nodes.sort(key=lambda x: x[1], reverse=True)
-            min_score_threshold = scored_nodes[0][1] * 0.6
-            scored_nodes = [(node, score) for node, score in scored_nodes
-                            if score >= min_score_threshold]
-
-        selected_nodes = []
-        selected_docs = set()
-        selected_sections = set()
-
-        for node, score in scored_nodes:
-            if len(selected_nodes) >= top_k:
-                break
-
-            metadata = node.metadata if hasattr(node, 'metadata') else {}
-            doc_id = metadata.get('document_id', 'unknown')
-            section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
-
-            # Apply diversity penalty
-            penalty = 0
-            if doc_id in selected_docs:
-                penalty += diversity_penalty * 0.5
-            if section_key in selected_sections:
-                penalty += diversity_penalty
-
-            adjusted_score = score * (1 - penalty)
-
-            # Add if still competitive
-            if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.6:
-                selected_nodes.append((node, score))
-                selected_docs.add(doc_id)
-                selected_sections.add(section_key)
-
-        log_message(f"Выбрано {len(selected_nodes)} узлов с разнообразием")
-        log_message(f"Уникальных документов: {len(selected_docs)}, секций: {len(selected_sections)}")
-
-        if selected_nodes:
-            log_message(f"Score range: {selected_nodes[0][1]:.3f} to {selected_nodes[-1][1]:.3f}")
-
-        return [node for node, score in selected_nodes]
+        # Apply threshold
+        filtered = [(node, score) for node, score in scored_nodes if score >= min_score_threshold]
+
+        if not filtered:
+            # Lower threshold if nothing passes
+            filtered = scored_nodes[:top_k]
+
+        log_message(f"Выбрано {min(len(filtered), top_k)} узлов")
+
+        return [node for node, score in filtered[:top_k]]
 
     except Exception as e:
         log_message(f"Ошибка переранжировки: {str(e)}")
@@ -79,26 +42,28 @@ def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5):
 
 def create_query_engine(vector_index):
     try:
+        from config import CUSTOM_PROMPT
+
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=20
+            similarity_top_k=40
         )
 
         vector_retriever = VectorIndexRetriever(
            index=vector_index,
-            similarity_top_k=30,
+            similarity_top_k=40,
            similarity_cutoff=0.65
        )
 
        hybrid_retriever = QueryFusionRetriever(
            [vector_retriever, bm25_retriever],
            similarity_top_k=40,
            num_queries=1
        )
 
-        custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
+        custom_prompt_template = PromptTemplate(CUSTOM_PROMPT)
        response_synthesizer = get_response_synthesizer(
            response_mode=ResponseMode.TREE_SUMMARIZE,
            text_qa_template=custom_prompt_template
        )
table_prep.py CHANGED
@@ -32,36 +32,12 @@ def create_table_content(table_data):
 from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
 
-def extract_table_metadata(table_text: str) -> dict:
-    words = table_text.split()
-    unique_words = set(words)
-
-    from collections import Counter
-    stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
-    filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
-    common = Counter(filtered).most_common(15)
-    key_terms = [w for w, _ in common]
-
-    return {
-        "summary": f"Таблица содержит около {len(words)} слов и {len(unique_words)} уникальных терминов.",
-        "materials": [],  # if you want to extract material names, hook in regex or LLM here
-        "key_terms": key_terms
-    }
-
-def chunk_table_document(doc, chunk_size=None, chunk_overlap=None, rows_per_chunk=4):
-    if chunk_size is None:
-        chunk_size = CHUNK_SIZE
-    if chunk_overlap is None:
-        chunk_overlap = CHUNK_OVERLAP
+def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
+    """Simple table chunking: max 5 rows or 2000 chars per chunk"""
 
-    # Extract critical metadata from table before chunking
-    table_metadata = extract_table_metadata(doc.text)
     table_num = doc.metadata.get('table_number', 'unknown')
-    table_title = doc.metadata.get('table_title', 'unknown')
-    doc_id = doc.metadata.get('document_id', 'unknown')
-    section = doc.metadata.get('section', 'unknown')
 
-    # Parse table structure
+    # Parse table
     lines = doc.text.strip().split('\n')
 
     table_header_lines = []
@@ -80,109 +56,59 @@ def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
     table_header = '\n'.join(table_header_lines) + '\n'
 
     if not data_rows:
-        log_message(f"  ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
-        text_splitter = SentenceSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-            separator="\n"
-        )
-        text_chunks = text_splitter.split_text(doc.text)
-        log_message(f"  📊 Стандартное разбиение: {len(text_chunks)} чанков")
-    else:
-        log_message(f"  📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
-
-        header_size = len(table_header)
-        available_size = chunk_size - header_size - 300  # Reserve for enrichment
-
-        text_chunks = []
-        current_chunk_rows = []
-        current_size = 0
-
-        for row in data_rows:
-            row_size = len(row) + 1
-
-            # If single row exceeds available size, split it
-            if row_size > available_size:
-                log_message(f"  ⚠️ Строка слишком длинная ({row_size} символов), разбиваем внутри строки")
-
-                # Flush current chunk if exists
-                if current_chunk_rows:
-                    chunk_text = table_header + '\n'.join(current_chunk_rows)
-                    text_chunks.append(chunk_text)
-                    log_message(f"  ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-                    current_chunk_rows = []
-                    current_size = 0
-
-                # Split the oversized row
-                text_splitter = SentenceSplitter(
-                    chunk_size=available_size,
-                    chunk_overlap=100,
-                    separator=" | "
-                )
-                row_parts = text_splitter.split_text(row)
-                log_message(f"  Строка разделена на {len(row_parts)} частей")
-
-                for part in row_parts:
-                    chunk_text = table_header + part
-                    text_chunks.append(chunk_text)
-                    log_message(f"  Под-чанк создан: {len(chunk_text)} символов")
-
-                continue
-
-            # Check if adding row would exceed rows_per_chunk OR size limit
-            if (len(current_chunk_rows) >= rows_per_chunk or
-                (current_size + row_size > available_size)) and current_chunk_rows:
-
-                chunk_text = table_header + '\n'.join(current_chunk_rows)
-                text_chunks.append(chunk_text)
-                log_message(f"  ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-
-                # Overlap: keep last 1 row
-                overlap_count = min(1, len(current_chunk_rows))
-                current_chunk_rows = current_chunk_rows[-overlap_count:]
-                current_size = sum(len(r) + 1 for r in current_chunk_rows)
-
-            current_chunk_rows.append(row)
-            current_size += row_size
-
-        # Final chunk
-        if current_chunk_rows:
-            chunk_text = table_header + '\n'.join(current_chunk_rows)
-            text_chunks.append(chunk_text)
-            log_message(f"  ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-
-        log_message(f"  📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
-
-    # Create enriched chunks (rest of the function remains the same)
-    chunked_docs = []
-    materials = table_metadata.get("materials", [])
-    key_terms = table_metadata.get("key_terms", [])
-
-    for i, chunk_text in enumerate(text_chunks):
+        # No rows, return as is
+        return [doc]
+
+    log_message(f"Таблица {table_num}: {len(data_rows)} строк")
+
+    # Simple chunking
+    chunks = []
+    current_chunk_rows = []
+    current_size = len(table_header)
+
+    for row in data_rows:
+        row_size = len(row) + 1
+
+        # Check if adding this row exceeds limits
+        if (len(current_chunk_rows) >= max_rows_per_chunk or
+            current_size + row_size > max_chunk_size) and current_chunk_rows:
+
+            # Save current chunk
+            chunk_text = table_header + '\n'.join(current_chunk_rows)
+            chunks.append(chunk_text)
+            log_message(f"  Чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
+
+            # Start new chunk with overlap of 1 row
+            if len(current_chunk_rows) > 0:
+                current_chunk_rows = [current_chunk_rows[-1]]
+                current_size = len(table_header) + len(current_chunk_rows[0]) + 1
+            else:
+                current_chunk_rows = []
+                current_size = len(table_header)
+
+        current_chunk_rows.append(row)
+        current_size += row_size
+
+    # Final chunk
+    if current_chunk_rows:
+        chunk_text = table_header + '\n'.join(current_chunk_rows)
+        chunks.append(chunk_text)
+        log_message(f"  Последний чанк: {len(current_chunk_rows)} строк")
+
+    log_message(f"Таблица {table_num} разделена на {len(chunks)} чанков")
+
+    # Create documents
+    chunked_docs = []
+    for i, chunk_text in enumerate(chunks):
         chunk_metadata = doc.metadata.copy()
         chunk_metadata.update({
             "chunk_id": i,
-            "total_chunks": len(text_chunks),
+            "total_chunks": len(chunks),
             "chunk_size": len(chunk_text),
-            "is_chunked": True,
-            "materials": materials,
-            "key_terms": key_terms,
-            "table_summary": table_metadata.get("summary", "")
+            "is_chunked": True
         })
 
-        materials_str = ', '.join(materials[:10]) if materials else 'нет'
-        terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
-
-        enriched_text = f"""[Таблица {table_num}: {table_title}]
-[Материалы в таблице: {materials_str}]
-[Ключевые термины: {terms_str}]
-
-{chunk_text}"""
-
-        chunked_doc = Document(
-            text=enriched_text,
-            metadata=chunk_metadata
-        )
+        chunked_doc = Document(text=chunk_text, metadata=chunk_metadata)
         chunked_docs.append(chunked_doc)
 
     return chunked_docs
@@ -222,8 +148,6 @@ def table_to_document(table_data, document_id=None):
         )
 
         if content_size > CHUNK_SIZE:
-            log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
-                        f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
             chunked_docs = chunk_table_document(base_doc)
             log_message(f"  ✂️ Разделена на {len(chunked_docs)} чанков")
             for i, chunk_doc in enumerate(chunked_docs):
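
The new chunking policy is easy to verify in isolation: every chunk repeats the table header, holds at most max_rows_per_chunk rows or max_chunk_size characters, and starts with a one-row overlap carried over from the previous chunk. A standalone sketch of the same loop (chunk_rows is a hypothetical helper written for this note, not the module function):

    def chunk_rows(header, rows, max_rows=5, max_size=2000):
        chunks, current, size = [], [], len(header)
        for row in rows:
            row_size = len(row) + 1
            # Flush when adding this row would exceed either limit
            if current and (len(current) >= max_rows or size + row_size > max_size):
                chunks.append(header + '\n'.join(current))
                current = [current[-1]]  # one-row overlap into the next chunk
                size = len(header) + len(current[0]) + 1
            current.append(row)
            size += row_size
        if current:
            chunks.append(header + '\n'.join(current))
        return chunks

    header = 'name | density\n---- | -------\n'
    rows = [f'material_{i} | {i}.0' for i in range(12)]
    print([len(c.splitlines()) - 2 for c in chunk_rows(header, rows)])
    # -> [5, 5, 4] data rows per chunk; chunks 2 and 3 begin with the overlap row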
utils.py CHANGED
@@ -261,41 +261,20 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     try:
         start_time = time.time()
 
-        llm = get_llm_model(current_model)
-
-        # Direct retrieval without query expansion
+        # Simple retrieval
         retrieved_nodes = query_engine.retriever.retrieve(question)
 
-        total_retrieved = len(retrieved_nodes)
-        log_message(f"RETRIEVED: {total_retrieved} nodes (before deduplication)")
+        log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
 
         # Deduplicate
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
-        duplicates_removed = total_retrieved - len(unique_retrieved)
-        log_message(f"DEDUPLICATION: {duplicates_removed} duplicates removed")
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
 
-        reranked_nodes = rerank_nodes(
-            question,
-            unique_retrieved,
-            reranker,
-            top_k=20,
-            min_score_threshold=0.5,
-            diversity_penalty=0.3
-        )
+        # Simple reranking
+        reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
 
-        formatted_context = format_context_for_llm(reranked_nodes)
-
-        enhanced_question = f"""Контекст из базы данных:
-{formatted_context}
-
-Вопрос пользователя: {question}
-
-Инструкция: Ответь на вопрос, используя ТОЛЬКО информацию из контекста выше.
-Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
-
-        response = query_engine.query(enhanced_question)
+        # Direct query without formatting
+        response = query_engine.query(question)
 
         end_time = time.time()
         processing_time = end_time - start_time
@@ -317,12 +296,9 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
             metadata = node.metadata if hasattr(node, 'metadata') else {}
             chunk_info.append({
                 'document_id': metadata.get('document_id', 'unknown'),
-                'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
+                'section_id': metadata.get('section_id', 'unknown'),
                 'section_path': metadata.get('section_path', ''),
                 'section_text': metadata.get('section_text', ''),
-                'level': metadata.get('level', ''),
-                'parent_section': metadata.get('parent_section', ''),
-                'parent_title': metadata.get('parent_title', ''),
                 'type': metadata.get('type', 'text'),
                 'table_number': metadata.get('table_number', ''),
                 'image_number': metadata.get('image_number', ''),
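
After this change the answer path is linear: retrieve, deduplicate, rerank, then query the engine with the raw question (the reranked nodes appear to feed only the returned chunk metadata, since the context formatting step was removed). A schematic sketch of that flow; deduplicate_nodes below is an assumed text-based dedupe, the real module defines its own, and rerank_nodes is the sketch shown after index_retriever.py:

    import time

    def deduplicate_nodes(nodes):
        # Assumed behaviour: drop nodes whose text was already seen
        seen, unique = set(), []
        for node in nodes:
            key = node.text.strip()
            if key not in seen:
                seen.add(key)
                unique.append(node)
        return unique

    def answer_question(question, query_engine, reranker):
        start_time = time.time()
        retrieved_nodes = query_engine.retriever.retrieve(question)  # simple retrieval
        unique_retrieved = deduplicate_nodes(retrieved_nodes)
        reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
        response = query_engine.query(question)  # direct query, no context formatting
        return response, reranked_nodes, time.time() - start_time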