MrSimple07 committed on
Commit
566457a
·
1 Parent(s): 09d215a

new rag with max chunk size + api for chunks

Browse files
Files changed (3) hide show
  1. app.py +48 -0
  2. table_prep.py +3 -12
  3. utils.py +69 -140
app.py CHANGED
@@ -248,10 +248,52 @@ def main_answer_question(question):
248
  "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
249
  "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
250
 
 
 
 
251
 
 
 
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
254
  with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
 
255
 
256
  gr.Markdown("""
257
  # AIEXP - Artificial Intelligence Expert
@@ -361,6 +403,9 @@ def main_switch_model(model_name):
361
 
362
  return status_message
363
 
 
 
 
364
  def main():
365
  global query_engine, chunks_df, reranker, vector_index, current_model
366
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
@@ -387,6 +432,9 @@ def main():
387
  current_model=current_model,
388
  chunk_info=chunk_info
389
  )
 
 
 
390
  demo.launch(
391
  server_name="0.0.0.0",
392
  server_port=7860,
 
248
  "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
249
  "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
250
 
251
def retrieve_chunks(question: str, top_k: int = 20) -> list:
    """Retrieve, rerank and serialize the chunks that match ``question``.

    Args:
        question: Query text handed to the retriever and the reranker.
        top_k: Forwarded to ``rerank_nodes`` as ``top_k``.

    Returns:
        A list of dicts, one per reranked node, each carrying a 1-based
        ``rank``, selected metadata fields and the node ``text``. An empty
        list is returned when the module-level ``query_engine`` is ``None``
        or when any exception is raised during retrieval/reranking.
    """
    from index_retriever import rerank_nodes

    global query_engine, reranker

    if query_engine is None:
        return []

    try:
        candidates = query_engine.retriever.retrieve(question)
        log_message(f"Получено {len(candidates)} узлов")

        ranked = rerank_nodes(
            question,
            candidates,
            reranker,
            top_k=top_k,
            min_score_threshold=0.5,
        )

        chunks_data = []
        for position, node in enumerate(ranked, start=1):
            # Nodes without a ``metadata`` attribute are treated as having none.
            meta = getattr(node, 'metadata', {})
            chunks_data.append({
                'rank': position,
                'document_id': meta.get('document_id', 'unknown'),
                'section_id': meta.get('section_id', ''),
                'section_path': meta.get('section_path', ''),
                'section_text': meta.get('section_text', ''),
                'type': meta.get('type', 'text'),
                'table_number': meta.get('table_number', ''),
                'image_number': meta.get('image_number', ''),
                'text': node.text,
            })

        log_message(f"Возвращено {len(chunks_data)} чанков")
        return chunks_data

    except Exception as e:
        # Best-effort API: failures are logged and reported as "no chunks".
        log_message(f"Ошибка получения чанков: {str(e)}")
        return []
292
+
293
+
294
  def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
295
  with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
296
+ gr.api(retrieve_chunks, api_name="retrieve_chunks")
297
 
298
  gr.Markdown("""
299
  # AIEXP - Artificial Intelligence Expert
 
403
 
404
  return status_message
405
 
406
+
407
+
408
+
409
  def main():
410
  global query_engine, chunks_df, reranker, vector_index, current_model
411
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
 
432
  current_model=current_model,
433
  chunk_info=chunk_info
434
  )
435
+ demo.api = "retrieve_chunks"
436
+ demo.queue()
437
+
438
  demo.launch(
439
  server_name="0.0.0.0",
440
  server_port=7860,
table_prep.py CHANGED
@@ -3,12 +3,10 @@ import json
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
 
6
 
7
- MAX_ROWS_PER_CHUNK = 10
8
- MAX_CHUNK_SIZE = 4000
9
 
10
  def create_table_content(table_data):
11
- """Create formatted content from table data"""
12
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
13
  table_num = table_data.get('table_number', 'Неизвестно')
14
  table_title = table_data.get('table_title', 'Неизвестно')
@@ -32,10 +30,9 @@ def create_table_content(table_data):
32
 
33
  return content
34
 
35
- def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=MAX_ROWS_PER_CHUNK):
36
  lines = doc.text.strip().split('\n')
37
 
38
- # Separate header and data rows
39
  header_lines = []
40
  data_rows = []
41
  in_data = False
@@ -99,8 +96,6 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=
99
 
100
 
101
  def table_to_document(table_data, document_id=None):
102
- """Convert table data to Document, chunk if needed"""
103
-
104
  if not isinstance(table_data, dict):
105
  return []
106
 
@@ -146,11 +141,7 @@ def table_to_document(table_data, document_id=None):
146
  return [base_doc]
147
 
148
 
149
- def load_table_data(repo_id, hf_token, table_data_dir):
150
- log_message("=" * 60)
151
- log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
152
- log_message("=" * 60)
153
-
154
  try:
155
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
156
  table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
 
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
+ from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
7
 
 
 
8
 
9
  def create_table_content(table_data):
 
10
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
11
  table_num = table_data.get('table_number', 'Неизвестно')
12
  table_title = table_data.get('table_title', 'Неизвестно')
 
30
 
31
  return content
32
 
33
+ def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
34
  lines = doc.text.strip().split('\n')
35
 
 
36
  header_lines = []
37
  data_rows = []
38
  in_data = False
 
96
 
97
 
98
  def table_to_document(table_data, document_id=None):
 
 
99
  if not isinstance(table_data, dict):
100
  return []
101
 
 
141
  return [base_doc]
142
 
143
 
144
+ def load_table_data(repo_id, hf_token, table_data_dir):
 
 
 
 
145
  try:
146
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
147
  table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
utils.py CHANGED
@@ -43,99 +43,6 @@ def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingua
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
46
# Values of metadata['level'] that are rendered as "раздел <parent>, пункт <id>".
_NESTED_SECTION_LEVELS = ('subsection', 'sub_subsection', 'sub_sub_subsection')


def _describe_section(metadata):
    """Return the section part of a source label, or '' if no section is set.

    ``section_path`` is preferred; ``section_id`` is used only when the path
    is missing or empty.
    """
    section_ref = metadata.get('section_path') or metadata.get('section_id')
    if not section_ref:
        return ""

    section_text = metadata.get('section_text', '')
    parent_section = metadata.get('parent_section', '')
    parent_title = metadata.get('parent_title', '')

    label = f"{section_ref} ({section_text})" if section_text else f"{section_ref}"

    is_nested = metadata.get('level', '') in _NESTED_SECTION_LEVELS
    if is_nested and parent_section and parent_title:
        # Nested items are cited under their parent section.
        return f"раздел {parent_section} ({parent_title}), пункт {label}"
    return f"раздел {label}"


def _describe_numbered_item(metadata, noun, number_key, title_key):
    """Return a label for a table/image: '<noun> №N (title), раздел X'.

    The caller must have checked that ``metadata[number_key]`` is present.
    """
    number = metadata[number_key]
    if not str(number).startswith('№'):
        number = f"№{number}"

    title = metadata.get(title_key, '')
    section_ref = metadata.get('section_path') or metadata.get('section_id')
    base_section = f", раздел {section_ref}" if section_ref else ""

    if title:
        return f"{noun} {number} ({title}){base_section}"
    return f"{noun} {number}{base_section}"


def format_context_for_llm(nodes):
    """Render retrieved nodes as source-tagged context blocks.

    Each node becomes ``[ИСТОЧНИК: <label>, документ <doc_id>]`` followed by
    the node text on the next line. The label is built from section metadata
    and is overridden by a table/image label when the node's ``type`` is
    ``'table'``/``'image'`` and the matching number is present.

    Args:
        nodes: Iterable of objects that may expose ``metadata`` (a dict) and
            ``text``. A node without ``metadata`` is treated as having none;
            a node without ``text`` is rendered with ``str(node)``.

    Returns:
        The per-node blocks joined with a newline; '' for no nodes.
    """
    context_parts = []

    for node in nodes:
        metadata = getattr(node, 'metadata', {})
        doc_id = metadata.get('document_id', 'Неизвестный документ')

        section_info = _describe_section(metadata)

        # Table/image labels replace the plain section label when applicable.
        node_type = metadata.get('type')
        if node_type == 'table' and metadata.get('table_number'):
            section_info = _describe_numbered_item(
                metadata, "Таблица", 'table_number', 'table_title'
            )
        elif node_type == 'image' and metadata.get('image_number'):
            section_info = _describe_numbered_item(
                metadata, "Рисунок", 'image_number', 'image_title'
            )

        context_text = node.text if hasattr(node, 'text') else str(node)

        if section_info:
            header = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]"
        else:
            header = f"[ИСТОЧНИК: документ {doc_id}]"

        context_parts.append(f"{header}\n{context_text}\n")

    return "\n".join(context_parts)
137
-
138
-
139
  def generate_sources_html(nodes, chunks_df=None):
140
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
141
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
@@ -146,19 +53,16 @@ def generate_sources_html(nodes, chunks_df=None):
146
  metadata = node.metadata if hasattr(node, 'metadata') else {}
147
  doc_type = metadata.get('type', 'text')
148
  doc_id = metadata.get('document_id', 'unknown')
149
- section_id = metadata.get('section_id', '')
150
- section_text = metadata.get('section_text', '')
151
- section_path = metadata.get('section_path', '')
152
 
153
- # Create a unique key for grouping
154
- if doc_type == 'table':
155
  table_num = metadata.get('table_number', 'unknown')
156
  key = f"{doc_id}_table_{table_num}"
157
  elif doc_type == 'image':
158
  image_num = metadata.get('image_number', 'unknown')
159
  key = f"{doc_id}_image_{image_num}"
160
  else:
161
- # For text documents, group by section path or section id
 
162
  section_key = section_path if section_path else section_id
163
  key = f"{doc_id}_text_{section_key}"
164
 
@@ -170,13 +74,14 @@ def generate_sources_html(nodes, chunks_df=None):
170
  'sections': set()
171
  }
172
 
173
- # Add section information
174
- if section_path:
175
- sources_by_doc[key]['sections'].add(f"пункт {section_path}")
176
- elif section_id and section_id != 'unknown':
177
- sources_by_doc[key]['sections'].add(f"пункт {section_id}")
 
 
178
 
179
- # Generate HTML for each unique source
180
  for source_info in sources_by_doc.values():
181
  metadata = source_info['metadata']
182
  doc_type = source_info['doc_type']
@@ -186,7 +91,6 @@ def generate_sources_html(nodes, chunks_df=None):
186
 
187
  if doc_type == 'text':
188
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
189
-
190
  elif doc_type == 'table' or doc_type == 'table_row':
191
  table_num = metadata.get('table_number', 'unknown')
192
  table_title = metadata.get('table_title', '')
@@ -198,23 +102,16 @@ def generate_sources_html(nodes, chunks_df=None):
198
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
199
  else:
200
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
201
-
202
  elif doc_type == 'image':
203
  image_num = metadata.get('image_number', 'unknown')
204
  image_title = metadata.get('image_title', '')
205
- section = metadata.get('section', '')
206
  if image_num and image_num != 'unknown':
207
  if not str(image_num).startswith('№'):
208
  image_num = f"№{image_num}"
209
  html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
210
  if image_title and image_title != 'unknown':
211
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
212
- if section and section != 'unknown':
213
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
214
- else:
215
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
216
 
217
- # Add file link if available
218
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
219
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
220
  if not doc_rows.empty:
@@ -225,40 +122,75 @@ def generate_sources_html(nodes, chunks_df=None):
225
 
226
  html += "</div>"
227
  return html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
229
  if query_engine is None:
230
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
231
 
232
  try:
233
  start_time = time.time()
234
-
235
- llm = get_llm_model(current_model)
236
-
237
- # Direct retrieval without query expansion
238
  retrieved_nodes = query_engine.retriever.retrieve(question)
239
 
240
- log_message(f"Получено {len(retrieved_nodes)} узлов")
241
-
242
- reranked_nodes = rerank_nodes(
243
- question,
244
- retrieved_nodes,
245
- reranker,
246
- top_k=40,
247
- min_score_threshold=0.5,
248
- diversity_penalty=0.3
249
- )
250
 
251
- formatted_context = format_context_for_llm(reranked_nodes)
 
252
 
253
- enhanced_question = f"""Контекст из базы данных:
254
- {formatted_context}
255
-
256
- Вопрос пользователя: {question}
257
-
258
- Инструкция: Ответь на вопрос, используя ТОЛЬКО информацию из контекста выше.
259
- Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
260
 
261
- response = query_engine.query(enhanced_question)
 
262
 
263
  end_time = time.time()
264
  processing_time = end_time - start_time
@@ -280,12 +212,9 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
280
  metadata = node.metadata if hasattr(node, 'metadata') else {}
281
  chunk_info.append({
282
  'document_id': metadata.get('document_id', 'unknown'),
283
- 'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
284
  'section_path': metadata.get('section_path', ''),
285
  'section_text': metadata.get('section_text', ''),
286
- 'level': metadata.get('level', ''),
287
- 'parent_section': metadata.get('parent_section', ''),
288
- 'parent_title': metadata.get('parent_title', ''),
289
  'type': metadata.get('type', 'text'),
290
  'table_number': metadata.get('table_number', ''),
291
  'image_number': metadata.get('image_number', ''),
 
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def generate_sources_html(nodes, chunks_df=None):
47
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
48
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
 
53
  metadata = node.metadata if hasattr(node, 'metadata') else {}
54
  doc_type = metadata.get('type', 'text')
55
  doc_id = metadata.get('document_id', 'unknown')
 
 
 
56
 
57
+ if doc_type == 'table' or doc_type == 'table_row':
 
58
  table_num = metadata.get('table_number', 'unknown')
59
  key = f"{doc_id}_table_{table_num}"
60
  elif doc_type == 'image':
61
  image_num = metadata.get('image_number', 'unknown')
62
  key = f"{doc_id}_image_{image_num}"
63
  else:
64
+ section_path = metadata.get('section_path', '')
65
+ section_id = metadata.get('section_id', '')
66
  section_key = section_path if section_path else section_id
67
  key = f"{doc_id}_text_{section_key}"
68
 
 
74
  'sections': set()
75
  }
76
 
77
+ if doc_type not in ['table', 'table_row', 'image']:
78
+ section_path = metadata.get('section_path', '')
79
+ section_id = metadata.get('section_id', '')
80
+ if section_path:
81
+ sources_by_doc[key]['sections'].add(f"пункт {section_path}")
82
+ elif section_id and section_id != 'unknown':
83
+ sources_by_doc[key]['sections'].add(f"пункт {section_id}")
84
 
 
85
  for source_info in sources_by_doc.values():
86
  metadata = source_info['metadata']
87
  doc_type = source_info['doc_type']
 
91
 
92
  if doc_type == 'text':
93
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
 
94
  elif doc_type == 'table' or doc_type == 'table_row':
95
  table_num = metadata.get('table_number', 'unknown')
96
  table_title = metadata.get('table_title', '')
 
102
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
103
  else:
104
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
 
105
  elif doc_type == 'image':
106
  image_num = metadata.get('image_number', 'unknown')
107
  image_title = metadata.get('image_title', '')
 
108
  if image_num and image_num != 'unknown':
109
  if not str(image_num).startswith('№'):
110
  image_num = f"№{image_num}"
111
  html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
112
  if image_title and image_title != 'unknown':
113
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
 
 
 
 
114
 
 
115
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
116
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
117
  if not doc_rows.empty:
 
122
 
123
  html += "</div>"
124
  return html
125
+
126
def deduplicate_nodes(nodes):
    """Return ``nodes`` without duplicates, keeping the first occurrence of each.

    Every node is reduced to an identity string built from its metadata:

    * table / table_row: document, table identifier, and then the first
      available of: the ``is_complete_table`` flag, the ``row_start`` /
      ``row_end`` range, ``chunk_id``, or a hash of the node text;
    * image: document and ``image_number``;
    * anything else (text): document, ``section_id`` and ``chunk_id``
      (``chunk_id`` defaults to 0 when absent).

    Args:
        nodes: Iterable of objects exposing ``metadata`` (a dict) and, for
            table nodes that reach the hash fallback, ``text``.

    Returns:
        A list of the unique nodes in their original order.
    """
    import hashlib

    seen = set()
    unique_nodes = []

    for node in nodes:
        metadata = node.metadata
        doc_id = metadata.get('document_id', '')
        node_type = metadata.get('type', 'text')

        if node_type in ('table', 'table_row'):
            table_num = metadata.get('table_number', '')
            table_identifier = metadata.get('table_identifier', table_num)
            prefix = f"{doc_id}|table|{table_identifier}"

            # The row range distinguishes chunks of the same table.
            row_start = metadata.get('row_start', '')
            row_end = metadata.get('row_end', '')
            chunk_id = metadata.get('chunk_id', '')

            if metadata.get('is_complete_table', False):
                identifier = f"{prefix}|complete"
            elif row_start != '' and row_end != '':
                identifier = f"{prefix}|rows_{row_start}_{row_end}"
            elif chunk_id != '':
                identifier = f"{prefix}|chunk_{chunk_id}"
            else:
                # Last resort: hash the WHOLE text. Hashing only a short
                # prefix would merge distinct chunks of one table whenever
                # they start with the same header lines.
                content_hash = hashlib.md5(node.text.encode()).hexdigest()
                identifier = f"{prefix}|{content_hash}"

        elif node_type == 'image':
            img_num = metadata.get('image_number', '')
            identifier = f"{doc_id}|image|{img_num}"

        else:  # text
            section_id = metadata.get('section_id', '')
            chunk_id = metadata.get('chunk_id', 0)
            # For text, section_id + chunk_id is expected to be unique.
            identifier = f"{doc_id}|text|{section_id}|{chunk_id}"

        if identifier not in seen:
            seen.add(identifier)
            unique_nodes.append(node)

    return unique_nodes
174
+
175
+
176
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
177
  if query_engine is None:
178
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
179
 
180
  try:
181
  start_time = time.time()
 
 
 
 
182
  retrieved_nodes = query_engine.retriever.retrieve(question)
183
 
184
+ log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
 
 
 
 
 
 
 
 
 
185
 
186
+ unique_retrieved = deduplicate_nodes(retrieved_nodes)
187
+ log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
188
 
189
+ # Simple reranking
190
+ reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
 
 
 
 
 
191
 
192
+ # Direct query without formatting
193
+ response = query_engine.query(question)
194
 
195
  end_time = time.time()
196
  processing_time = end_time - start_time
 
212
  metadata = node.metadata if hasattr(node, 'metadata') else {}
213
  chunk_info.append({
214
  'document_id': metadata.get('document_id', 'unknown'),
215
+ 'section_id': metadata.get('section_id', 'unknown'),
216
  'section_path': metadata.get('section_path', ''),
217
  'section_text': metadata.get('section_text', ''),
 
 
 
218
  'type': metadata.get('type', 'text'),
219
  'table_number': metadata.get('table_number', ''),
220
  'image_number': metadata.get('image_number', ''),