MrSimple07 commited on
Commit
33c996e
·
1 Parent(s): 5f6b6af

new api = retrieve chunks + some more text fixing

Browse files
Files changed (5) hide show
  1. app.py +49 -1
  2. config.py +5 -2
  3. documents_prep.py +3 -51
  4. table_prep.py +3 -12
  5. utils.py +139 -70
app.py CHANGED
@@ -248,7 +248,49 @@ def main_answer_question(question):
248
  "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
249
  "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
250
 
251
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
254
  with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
@@ -361,6 +403,9 @@ def main_switch_model(model_name):
361
 
362
  return status_message
363
 
 
 
 
364
  def main():
365
  global query_engine, chunks_df, reranker, vector_index, current_model
366
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
@@ -387,6 +432,9 @@ def main():
387
  current_model=current_model,
388
  chunk_info=chunk_info
389
  )
 
 
 
390
  demo.launch(
391
  server_name="0.0.0.0",
392
  server_port=7860,
 
248
  "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
249
  "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
250
 
251
def retrieve_chunks(question, top_k=20, min_score_threshold=0.5):
    """Retrieve and rerank document chunks relevant to *question*.

    Exposed as a Gradio API endpoint; returns structured chunk data
    instead of a rendered answer.

    Parameters
    ----------
    question : str
        The user query to search for.
    top_k : int, optional
        Maximum number of reranked chunks to return (default 20).
    min_score_threshold : float, optional
        Minimum reranker score a chunk must reach to be kept (default 0.5).

    Returns
    -------
    list[dict] | str
        A list of chunk dicts (rank, document/section identifiers, type,
        table/image numbers, text) on success, or an error-message string
        when the engine is not initialized or retrieval fails.
    """
    from index_retriever import rerank_nodes

    # NOTE: `query_engine` and `reranker` are module-level singletons that
    # are only read here, so no `global` declaration is required.
    if query_engine is None:
        return "Система не инициализирована"

    try:
        retrieved_nodes = query_engine.retriever.retrieve(question)
        log_message(f"Получено {len(retrieved_nodes)} узлов")

        # Rerank with the cross-encoder and keep only confident matches.
        reranked_nodes = rerank_nodes(
            question,
            retrieved_nodes,
            reranker,
            top_k=top_k,
            min_score_threshold=min_score_threshold,
        )

        chunks_data = []
        for i, node in enumerate(reranked_nodes):
            metadata = node.metadata if hasattr(node, 'metadata') else {}
            chunks_data.append({
                'rank': i + 1,
                'document_id': metadata.get('document_id', 'unknown'),
                'section_id': metadata.get('section_id', ''),
                'section_path': metadata.get('section_path', ''),
                'section_text': metadata.get('section_text', ''),
                'type': metadata.get('type', 'text'),
                'table_number': metadata.get('table_number', ''),
                'image_number': metadata.get('image_number', ''),
                'text': node.text,
            })

        log_message(f"Возвращено {len(chunks_data)} чанков")
        return chunks_data

    except Exception as e:
        # Surface the failure to the API caller instead of crashing the app.
        log_message(f"Ошибка получения чанков: {str(e)}")
        return f"Ошибка: {str(e)}"
294
 
295
  def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
296
  with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
 
403
 
404
  return status_message
405
 
406
+ gr.api(retrieve_chunks, api_name="retrieve_chunks")
407
+
408
+
409
  def main():
410
  global query_engine, chunks_df, reranker, vector_index, current_model
411
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
 
432
  current_model=current_model,
433
  chunk_info=chunk_info
434
  )
435
+ demo.api = "retrieve_chunks"
436
+ demo.queue()
437
+
438
  demo.launch(
439
  server_name="0.0.0.0",
440
  server_port=7860,
config.py CHANGED
@@ -49,8 +49,11 @@ AVAILABLE_MODELS = {
49
 
50
  DEFAULT_MODEL = "Gemini 2.5 Flash"
51
 
52
- CHUNK_SIZE = 2000
53
- CHUNK_OVERLAP = 256
 
 
 
54
 
55
  CUSTOM_PROMPT = """
56
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
 
49
 
50
  DEFAULT_MODEL = "Gemini 2.5 Flash"
51
 
52
+ CHUNK_SIZE = 1500
53
+ CHUNK_OVERLAP = 128
54
+
55
+ MAX_CHARS_TABLE = 2500
56
+ MAX_ROWS_TABLE = 10
57
 
58
  CUSTOM_PROMPT = """
59
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
documents_prep.py CHANGED
@@ -5,10 +5,7 @@ from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
6
  from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
-
9
- # Configuration
10
- CHUNK_SIZE = 1500
11
- CHUNK_OVERLAP = 128
12
 
13
  def chunk_text_documents(documents):
14
  text_splitter = SentenceSplitter(
@@ -38,8 +35,7 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=2500, max_rows=10):
42
- """Chunk tables by content size AND row count"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
45
  table_num = table_data.get('table_number', 'unknown')
@@ -48,7 +44,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2500, max_rows=10):
48
 
49
  table_num_clean = str(table_num).strip()
50
 
51
- # Create section-aware identifier
52
  import re
53
  if 'приложени' in section.lower():
54
  appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
@@ -89,8 +84,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2500, max_rows=10):
89
 
90
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
91
  return [Document(text=content, metadata=metadata)]
92
-
93
- # Otherwise, chunk by BOTH content size AND row count
94
  chunks = []
95
  current_rows = []
96
  current_size = 0
@@ -100,7 +94,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2500, max_rows=10):
100
  row_text = format_single_row(row, i + 1)
101
  row_size = len(row_text)
102
 
103
- # Check BOTH limits: size AND row count
104
  should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
105
 
106
  if should_split:
@@ -203,43 +196,8 @@ def format_table_rows(rows):
203
 
204
 
205
  def format_table_footer(table_identifier, doc_id):
206
- """Format table footer"""
207
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
208
 
209
- def load_table_documents(repo_id, hf_token, table_dir):
210
- log_message("Loading tables...")
211
-
212
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
213
- table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
214
-
215
- all_chunks = []
216
- for file_path in table_files:
217
- try:
218
- local_path = hf_hub_download(
219
- repo_id=repo_id,
220
- filename=file_path,
221
- repo_type="dataset",
222
- token=hf_token
223
- )
224
-
225
- with open(local_path, 'r', encoding='utf-8') as f:
226
- data = json.load(f)
227
-
228
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
229
-
230
- for sheet in data.get('sheets', []):
231
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
232
-
233
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
234
- all_chunks.extend(chunks)
235
-
236
- except Exception as e:
237
- log_message(f"Error loading {file_path}: {e}")
238
-
239
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
240
- return all_chunks
241
-
242
-
243
  def load_json_documents(repo_id, hf_token, json_dir):
244
  import zipfile
245
  import tempfile
@@ -369,7 +327,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
369
  return documents
370
 
371
  def extract_sections_from_json(json_path):
372
- """Extract sections from a single JSON file"""
373
  documents = []
374
 
375
  try:
@@ -421,7 +378,6 @@ def extract_sections_from_json(json_path):
421
 
422
 
423
  def load_table_documents(repo_id, hf_token, table_dir):
424
- """Load and chunk tables"""
425
  log_message("Loading tables...")
426
 
427
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -439,15 +395,11 @@ def load_table_documents(repo_id, hf_token, table_dir):
439
 
440
  with open(local_path, 'r', encoding='utf-8') as f:
441
  data = json.load(f)
442
-
443
- # Extract file-level document_id
444
  file_doc_id = data.get('document_id', data.get('document', 'unknown'))
445
 
446
  for sheet in data.get('sheets', []):
447
- # Use sheet-level document_id if available, otherwise use file-level
448
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
449
 
450
- # CRITICAL: Pass document_id to chunk function
451
  chunks = chunk_table_by_content(sheet, sheet_doc_id)
452
  all_chunks.extend(chunks)
453
 
 
5
  from llama_index.core import Document
6
  from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
+ from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
 
 
 
9
 
10
  def chunk_text_documents(documents):
11
  text_splitter = SentenceSplitter(
 
35
  return chunked
36
 
37
 
38
+ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
 
39
  headers = table_data.get('headers', [])
40
  rows = table_data.get('data', [])
41
  table_num = table_data.get('table_number', 'unknown')
 
44
 
45
  table_num_clean = str(table_num).strip()
46
 
 
47
  import re
48
  if 'приложени' in section.lower():
49
  appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
 
84
 
85
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
86
  return [Document(text=content, metadata=metadata)]
87
+
 
88
  chunks = []
89
  current_rows = []
90
  current_size = 0
 
94
  row_text = format_single_row(row, i + 1)
95
  row_size = len(row_text)
96
 
 
97
  should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
98
 
99
  if should_split:
 
196
 
197
 
198
def format_table_footer(table_identifier, doc_id):
    """Render the closing banner that marks the end of a table dump."""
    separator = '=' * 70
    return f"\n{separator}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  def load_json_documents(repo_id, hf_token, json_dir):
202
  import zipfile
203
  import tempfile
 
327
  return documents
328
 
329
  def extract_sections_from_json(json_path):
 
330
  documents = []
331
 
332
  try:
 
378
 
379
 
380
  def load_table_documents(repo_id, hf_token, table_dir):
 
381
  log_message("Loading tables...")
382
 
383
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
 
395
 
396
  with open(local_path, 'r', encoding='utf-8') as f:
397
  data = json.load(f)
 
 
398
  file_doc_id = data.get('document_id', data.get('document', 'unknown'))
399
 
400
  for sheet in data.get('sheets', []):
 
401
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
402
 
 
403
  chunks = chunk_table_by_content(sheet, sheet_doc_id)
404
  all_chunks.extend(chunks)
405
 
table_prep.py CHANGED
@@ -3,12 +3,10 @@ import json
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
 
6
 
7
- MAX_ROWS_PER_CHUNK = 10
8
- MAX_CHUNK_SIZE = 4000
9
 
10
  def create_table_content(table_data):
11
- """Create formatted content from table data"""
12
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
13
  table_num = table_data.get('table_number', 'Неизвестно')
14
  table_title = table_data.get('table_title', 'Неизвестно')
@@ -32,10 +30,9 @@ def create_table_content(table_data):
32
 
33
  return content
34
 
35
- def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=MAX_ROWS_PER_CHUNK):
36
  lines = doc.text.strip().split('\n')
37
 
38
- # Separate header and data rows
39
  header_lines = []
40
  data_rows = []
41
  in_data = False
@@ -99,8 +96,6 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=
99
 
100
 
101
  def table_to_document(table_data, document_id=None):
102
- """Convert table data to Document, chunk if needed"""
103
-
104
  if not isinstance(table_data, dict):
105
  return []
106
 
@@ -146,11 +141,7 @@ def table_to_document(table_data, document_id=None):
146
  return [base_doc]
147
 
148
 
149
- def load_table_data(repo_id, hf_token, table_data_dir):
150
- log_message("=" * 60)
151
- log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
152
- log_message("=" * 60)
153
-
154
  try:
155
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
156
  table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
 
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
+ from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
7
 
 
 
8
 
9
  def create_table_content(table_data):
 
10
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
11
  table_num = table_data.get('table_number', 'Неизвестно')
12
  table_title = table_data.get('table_title', 'Неизвестно')
 
30
 
31
  return content
32
 
33
+ def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
34
  lines = doc.text.strip().split('\n')
35
 
 
36
  header_lines = []
37
  data_rows = []
38
  in_data = False
 
96
 
97
 
98
  def table_to_document(table_data, document_id=None):
 
 
99
  if not isinstance(table_data, dict):
100
  return []
101
 
 
141
  return [base_doc]
142
 
143
 
144
+ def load_table_data(repo_id, hf_token, table_data_dir):
 
 
 
 
145
  try:
146
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
147
  table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
utils.py CHANGED
@@ -43,6 +43,99 @@ def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingua
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def generate_sources_html(nodes, chunks_df=None):
47
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
48
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
@@ -53,16 +146,19 @@ def generate_sources_html(nodes, chunks_df=None):
53
  metadata = node.metadata if hasattr(node, 'metadata') else {}
54
  doc_type = metadata.get('type', 'text')
55
  doc_id = metadata.get('document_id', 'unknown')
 
 
 
56
 
57
- if doc_type == 'table' or doc_type == 'table_row':
 
58
  table_num = metadata.get('table_number', 'unknown')
59
  key = f"{doc_id}_table_{table_num}"
60
  elif doc_type == 'image':
61
  image_num = metadata.get('image_number', 'unknown')
62
  key = f"{doc_id}_image_{image_num}"
63
  else:
64
- section_path = metadata.get('section_path', '')
65
- section_id = metadata.get('section_id', '')
66
  section_key = section_path if section_path else section_id
67
  key = f"{doc_id}_text_{section_key}"
68
 
@@ -74,14 +170,13 @@ def generate_sources_html(nodes, chunks_df=None):
74
  'sections': set()
75
  }
76
 
77
- if doc_type not in ['table', 'table_row', 'image']:
78
- section_path = metadata.get('section_path', '')
79
- section_id = metadata.get('section_id', '')
80
- if section_path:
81
- sources_by_doc[key]['sections'].add(f"пункт {section_path}")
82
- elif section_id and section_id != 'unknown':
83
- sources_by_doc[key]['sections'].add(f"пункт {section_id}")
84
 
 
85
  for source_info in sources_by_doc.values():
86
  metadata = source_info['metadata']
87
  doc_type = source_info['doc_type']
@@ -91,6 +186,7 @@ def generate_sources_html(nodes, chunks_df=None):
91
 
92
  if doc_type == 'text':
93
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
 
94
  elif doc_type == 'table' or doc_type == 'table_row':
95
  table_num = metadata.get('table_number', 'unknown')
96
  table_title = metadata.get('table_title', '')
@@ -102,16 +198,23 @@ def generate_sources_html(nodes, chunks_df=None):
102
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
103
  else:
104
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
 
105
  elif doc_type == 'image':
106
  image_num = metadata.get('image_number', 'unknown')
107
  image_title = metadata.get('image_title', '')
 
108
  if image_num and image_num != 'unknown':
109
  if not str(image_num).startswith('№'):
110
  image_num = f"№{image_num}"
111
  html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
112
  if image_title and image_title != 'unknown':
113
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
 
 
 
 
114
 
 
115
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
116
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
117
  if not doc_rows.empty:
@@ -123,56 +226,6 @@ def generate_sources_html(nodes, chunks_df=None):
123
  html += "</div>"
124
  return html
125
 
126
- def deduplicate_nodes(nodes):
127
- """Deduplicate retrieved nodes based on content and metadata"""
128
- seen = set()
129
- unique_nodes = []
130
-
131
- for node in nodes:
132
- doc_id = node.metadata.get('document_id', '')
133
- node_type = node.metadata.get('type', 'text')
134
-
135
- if node_type == 'table' or node_type == 'table_row':
136
- table_num = node.metadata.get('table_number', '')
137
- table_identifier = node.metadata.get('table_identifier', table_num)
138
-
139
- # Use row range to distinguish table chunks
140
- row_start = node.metadata.get('row_start', '')
141
- row_end = node.metadata.get('row_end', '')
142
- is_complete = node.metadata.get('is_complete_table', False)
143
-
144
- if is_complete:
145
- identifier = f"{doc_id}|table|{table_identifier}|complete"
146
- elif row_start != '' and row_end != '':
147
- identifier = f"{doc_id}|table|{table_identifier}|rows_{row_start}_{row_end}"
148
- else:
149
- # Fallback: use chunk_id if available
150
- chunk_id = node.metadata.get('chunk_id', '')
151
- if chunk_id != '':
152
- identifier = f"{doc_id}|table|{table_identifier}|chunk_{chunk_id}"
153
- else:
154
- # Last resort: hash first 100 chars of content
155
- import hashlib
156
- content_hash = hashlib.md5(node.text[:100].encode()).hexdigest()[:8]
157
- identifier = f"{doc_id}|table|{table_identifier}|{content_hash}"
158
-
159
- elif node_type == 'image':
160
- img_num = node.metadata.get('image_number', '')
161
- identifier = f"{doc_id}|image|{img_num}"
162
-
163
- else: # text
164
- section_id = node.metadata.get('section_id', '')
165
- chunk_id = node.metadata.get('chunk_id', 0)
166
- # For text, section_id + chunk_id should be unique
167
- identifier = f"{doc_id}|text|{section_id}|{chunk_id}"
168
-
169
- if identifier not in seen:
170
- seen.add(identifier)
171
- unique_nodes.append(node)
172
-
173
- return unique_nodes
174
-
175
-
176
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
177
  if query_engine is None:
178
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
@@ -180,20 +233,33 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
180
  try:
181
  start_time = time.time()
182
 
183
- # Simple retrieval
 
 
184
  retrieved_nodes = query_engine.retriever.retrieve(question)
185
 
186
- log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
 
 
 
 
 
 
 
 
 
187
 
188
- # Deduplicate
189
- unique_retrieved = deduplicate_nodes(retrieved_nodes)
190
- log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
191
 
192
- # Simple reranking
193
- reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
 
 
 
 
 
194
 
195
- # Direct query without formatting
196
- response = query_engine.query(question)
197
 
198
  end_time = time.time()
199
  processing_time = end_time - start_time
@@ -215,9 +281,12 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
215
  metadata = node.metadata if hasattr(node, 'metadata') else {}
216
  chunk_info.append({
217
  'document_id': metadata.get('document_id', 'unknown'),
218
- 'section_id': metadata.get('section_id', 'unknown'),
219
  'section_path': metadata.get('section_path', ''),
220
  'section_text': metadata.get('section_text', ''),
 
 
 
221
  'type': metadata.get('type', 'text'),
222
  'table_number': metadata.get('table_number', ''),
223
  'image_number': metadata.get('image_number', ''),
 
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
46
# Levels whose reference must be prefixed with their parent section.
_SUBSECTION_LEVELS = ('subsection', 'sub_subsection', 'sub_sub_subsection')


def _section_reference(metadata):
    """Build a human-readable 'раздел/пункт' reference from section metadata.

    Prefers ``section_path`` over ``section_id``; returns '' when neither
    key holds a truthy value.
    """
    ref = metadata.get('section_path') or metadata.get('section_id')
    if not ref:
        return ""

    section_text = metadata.get('section_text', '')
    parent_section = metadata.get('parent_section', '')
    parent_title = metadata.get('parent_title', '')

    if (metadata.get('level', '') in _SUBSECTION_LEVELS
            and parent_section and parent_title):
        # Subsection form: "раздел X (Title), пункт X.Y[ (text)]"
        base = f"раздел {parent_section} ({parent_title}), пункт {ref}"
        return f"{base} ({section_text})" if section_text else base
    if section_text:
        # Main-section form: "раздел X (Title)"
        return f"раздел {ref} ({section_text})"
    return f"раздел {ref}"


def _media_reference(metadata, number_key, title_key, label):
    """Build a table/image reference like "Таблица №N (Title), раздел X".

    ``metadata[number_key]`` must be present and truthy (caller checks).
    """
    number = metadata[number_key]
    if not str(number).startswith('№'):
        number = f"№{number}"
    title = metadata.get(title_key, '')

    # Keep the section context for tables/images when available.
    base_section = ""
    if metadata.get('section_path'):
        base_section = f", раздел {metadata['section_path']}"
    elif metadata.get('section_id'):
        base_section = f", раздел {metadata['section_id']}"

    if title:
        return f"{label} {number} ({title}){base_section}"
    return f"{label} {number}{base_section}"


def format_context_for_llm(nodes):
    """Format retrieved nodes into a source-annotated context string for the LLM.

    Each node is rendered as an ``[ИСТОЧНИК: …]`` header followed by the
    node text; parts are joined with blank lines.

    Parameters
    ----------
    nodes : iterable
        Retrieved nodes; each may expose ``.metadata`` (dict) and ``.text``.
        Objects lacking those attributes fall back to ``{}`` / ``str(node)``.

    Returns
    -------
    str
        The concatenated, source-labelled context ('' for an empty input).
    """
    context_parts = []

    for node in nodes:
        metadata = node.metadata if hasattr(node, 'metadata') else {}
        doc_id = metadata.get('document_id', 'Неизвестный документ')

        section_info = _section_reference(metadata)

        # Tables and images override the plain section reference.
        if metadata.get('type') == 'table' and metadata.get('table_number'):
            section_info = _media_reference(metadata, 'table_number', 'table_title', 'Таблица')
        if metadata.get('type') == 'image' and metadata.get('image_number'):
            section_info = _media_reference(metadata, 'image_number', 'image_title', 'Рисунок')

        context_text = node.text if hasattr(node, 'text') else str(node)

        if section_info:
            context_parts.append(f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n")
        else:
            context_parts.append(f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n")

    return "\n".join(context_parts)
136
+ return "\n".join(context_parts)
137
+
138
+
139
  def generate_sources_html(nodes, chunks_df=None):
140
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
141
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
 
146
  metadata = node.metadata if hasattr(node, 'metadata') else {}
147
  doc_type = metadata.get('type', 'text')
148
  doc_id = metadata.get('document_id', 'unknown')
149
+ section_id = metadata.get('section_id', '')
150
+ section_text = metadata.get('section_text', '')
151
+ section_path = metadata.get('section_path', '')
152
 
153
+ # Create a unique key for grouping
154
+ if doc_type == 'table':
155
  table_num = metadata.get('table_number', 'unknown')
156
  key = f"{doc_id}_table_{table_num}"
157
  elif doc_type == 'image':
158
  image_num = metadata.get('image_number', 'unknown')
159
  key = f"{doc_id}_image_{image_num}"
160
  else:
161
+ # For text documents, group by section path or section id
 
162
  section_key = section_path if section_path else section_id
163
  key = f"{doc_id}_text_{section_key}"
164
 
 
170
  'sections': set()
171
  }
172
 
173
+ # Add section information
174
+ if section_path:
175
+ sources_by_doc[key]['sections'].add(f"пункт {section_path}")
176
+ elif section_id and section_id != 'unknown':
177
+ sources_by_doc[key]['sections'].add(f"пункт {section_id}")
 
 
178
 
179
+ # Generate HTML for each unique source
180
  for source_info in sources_by_doc.values():
181
  metadata = source_info['metadata']
182
  doc_type = source_info['doc_type']
 
186
 
187
  if doc_type == 'text':
188
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
189
+
190
  elif doc_type == 'table' or doc_type == 'table_row':
191
  table_num = metadata.get('table_number', 'unknown')
192
  table_title = metadata.get('table_title', '')
 
198
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
199
  else:
200
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
201
+
202
  elif doc_type == 'image':
203
  image_num = metadata.get('image_number', 'unknown')
204
  image_title = metadata.get('image_title', '')
205
+ section = metadata.get('section', '')
206
  if image_num and image_num != 'unknown':
207
  if not str(image_num).startswith('№'):
208
  image_num = f"№{image_num}"
209
  html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
210
  if image_title and image_title != 'unknown':
211
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
212
+ if section and section != 'unknown':
213
+ html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
214
+ else:
215
+ html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
216
 
217
+ # Add file link if available
218
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
219
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
220
  if not doc_rows.empty:
 
226
  html += "</div>"
227
  return html
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
230
  if query_engine is None:
231
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
 
233
  try:
234
  start_time = time.time()
235
 
236
+ llm = get_llm_model(current_model)
237
+
238
+ # Direct retrieval without query expansion
239
  retrieved_nodes = query_engine.retriever.retrieve(question)
240
 
241
+ log_message(f"Получено {len(retrieved_nodes)} узлов")
242
+
243
+ reranked_nodes = rerank_nodes(
244
+ question,
245
+ retrieved_nodes,
246
+ reranker,
247
+ top_k=40,
248
+ min_score_threshold=0.5,
249
+ diversity_penalty=0.3
250
+ )
251
 
252
+ formatted_context = format_context_for_llm(reranked_nodes)
 
 
253
 
254
+ enhanced_question = f"""Контекст из базы данных:
255
+ {formatted_context}
256
+
257
+ Вопрос пользователя: {question}
258
+
259
+ Инструкция: Ответь на вопрос, используя ТОЛЬКО информацию из контекста выше.
260
+ Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
261
 
262
+ response = query_engine.query(enhanced_question)
 
263
 
264
  end_time = time.time()
265
  processing_time = end_time - start_time
 
281
  metadata = node.metadata if hasattr(node, 'metadata') else {}
282
  chunk_info.append({
283
  'document_id': metadata.get('document_id', 'unknown'),
284
+ 'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
285
  'section_path': metadata.get('section_path', ''),
286
  'section_text': metadata.get('section_text', ''),
287
+ 'level': metadata.get('level', ''),
288
+ 'parent_section': metadata.get('parent_section', ''),
289
+ 'parent_title': metadata.get('parent_title', ''),
290
  'type': metadata.get('type', 'text'),
291
  'table_number': metadata.get('table_number', ''),
292
  'image_number': metadata.get('image_number', ''),