MrSimple07 committed on
Commit
499b5c3
·
1 Parent(s): 59c7b5b

priority table + images

Browse files
Files changed (2) hide show
  1. index_retriever.py +24 -12
  2. utils.py +0 -90
index_retriever.py CHANGED
@@ -56,21 +56,33 @@ def rerank_nodes(query, nodes, reranker, top_k=10):
56
  try:
57
  log_message(f"Переранжирую {len(nodes)} узлов")
58
 
59
- pairs = []
60
- for node in nodes:
61
- pairs.append([query, node.text])
 
62
 
63
- scores = reranker.predict(pairs)
64
 
65
- scored_nodes = list(zip(nodes, scores))
66
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
 
 
 
 
 
 
 
 
 
 
67
 
68
- reranked_nodes = [node for node, score in scored_nodes[:top_k]]
69
- log_message(f"Возвращаю топ-{len(reranked_nodes)} переранжированных узлов")
 
 
 
 
70
 
71
- return reranked_nodes
72
  except Exception as e:
73
  log_message(f"Ошибка переранжировки: {str(e)}")
74
- return nodes[:top_k]
75
-
76
-
 
56
  try:
57
  log_message(f"Переранжирую {len(nodes)} узлов")
58
 
59
+ # Separate tables and images from text nodes
60
+ table_nodes = [node for node in nodes if node.metadata.get('type') == 'table']
61
+ image_nodes = [node for node in nodes if node.metadata.get('type') == 'image']
62
+ text_nodes = [node for node in nodes if node.metadata.get('type', 'text') == 'text']
63
 
64
+ priority_nodes = table_nodes + image_nodes
65
 
66
+ # Rerank only text nodes
67
+ if text_nodes:
68
+ pairs = []
69
+ for node in text_nodes:
70
+ pairs.append([query, node.text])
71
+
72
+ scores = reranker.predict(pairs)
73
+ scored_nodes = list(zip(text_nodes, scores))
74
+ scored_nodes.sort(key=lambda x: x[1], reverse=True)
75
+ reranked_text_nodes = [node for node, score in scored_nodes]
76
+ else:
77
+ reranked_text_nodes = []
78
 
79
+ # Combine: priority nodes first, then reranked text nodes
80
+ final_nodes = priority_nodes + reranked_text_nodes
81
+ result = final_nodes[:top_k]
82
+
83
+ log_message(f"Возвращаю {len(priority_nodes)} приоритетных узлов и {len(result) - len(priority_nodes)} текстовых узлов")
84
+ return result
85
 
 
86
  except Exception as e:
87
  log_message(f"Ошибка переранжировки: {str(e)}")
88
+ return nodes[:top_k]
 
 
utils.py CHANGED
@@ -105,95 +105,6 @@ def format_context_for_llm(nodes):
105
 
106
  return "\n".join(context_parts)
107
 
108
- def generate_sources_html(nodes, chunks_df=None):
109
- html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
110
- html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
111
-
112
- # Group nodes by document to avoid duplicates
113
- sources_by_doc = {}
114
-
115
- for i, node in enumerate(nodes):
116
- metadata = node.metadata if hasattr(node, 'metadata') else {}
117
- doc_type = metadata.get('type', 'text')
118
- doc_id = metadata.get('document_id', 'unknown')
119
- section_id = metadata.get('section_id', '')
120
- section_text = metadata.get('section_text', '')
121
- section_path = metadata.get('section_path', '')
122
-
123
- if doc_type == 'table':
124
- table_num = metadata.get('table_number', 'unknown')
125
- key = f"{doc_id}_table_{table_num}"
126
- elif doc_type == 'image':
127
- image_num = metadata.get('image_number', 'unknown')
128
- key = f"{doc_id}_image_{image_num}"
129
- else:
130
- section_key = section_path if section_path else section_id
131
- key = f"{doc_id}_text_{section_key}"
132
-
133
- if key not in sources_by_doc:
134
- sources_by_doc[key] = {
135
- 'doc_id': doc_id,
136
- 'doc_type': doc_type,
137
- 'metadata': metadata,
138
- 'sections': set()
139
- }
140
-
141
- # Add section information
142
- if section_path:
143
- sources_by_doc[key]['sections'].add(f"{section_path}")
144
- elif section_id and section_id != 'unknown':
145
- sources_by_doc[key]['sections'].add(f"{section_id}")
146
-
147
- # Generate HTML for each unique source
148
- for source_info in sources_by_doc.values():
149
- metadata = source_info['metadata']
150
- doc_type = source_info['doc_type']
151
- doc_id = source_info['doc_id']
152
-
153
- html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
154
-
155
- if doc_type == 'text':
156
- html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
157
-
158
- elif doc_type == 'table' or doc_type == 'table_row':
159
- table_num = metadata.get('table_number', 'unknown')
160
- table_title = metadata.get('table_title', '')
161
- if table_num and table_num != 'unknown':
162
- if not str(table_num).startswith('№'):
163
- table_num = f"№{table_num}"
164
- html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
165
- if table_title and table_title != 'unknown':
166
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
167
- else:
168
- html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
169
-
170
- elif doc_type == 'image':
171
- image_num = metadata.get('image_number', 'unknown')
172
- image_title = metadata.get('image_title', '')
173
- section = metadata.get('section', '')
174
- if image_num and image_num != 'unknown':
175
- if not str(image_num).startswith('№'):
176
- image_num = f"№{image_num}"
177
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
178
- if image_title and image_title != 'unknown':
179
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
180
- if section and section != 'unknown':
181
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
182
- else:
183
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
184
-
185
- # Add file link if available
186
- if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
187
- doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
188
- if not doc_rows.empty:
189
- file_link = doc_rows.iloc[0]['file_link']
190
- html += f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>"
191
-
192
- html += "</div>"
193
-
194
- html += "</div>"
195
- return html
196
-
197
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
198
  if query_engine is None:
199
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
@@ -372,7 +283,6 @@ def generate_sources_html(nodes, chunks_df=None):
372
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
373
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
374
 
375
- # Group nodes by document to avoid duplicates
376
  sources_by_doc = {}
377
 
378
  for i, node in enumerate(nodes):
 
105
 
106
  return "\n".join(context_parts)
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
109
  if query_engine is None:
110
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
 
283
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
284
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
285
 
 
286
  sources_by_doc = {}
287
 
288
  for i, node in enumerate(nodes):