Spaces:
Sleeping
Sleeping
Commit
·
499b5c3
1
Parent(s):
59c7b5b
priority table + images
Browse files
- index_retriever.py +24 -12
- utils.py +0 -90
index_retriever.py
CHANGED
|
@@ -56,21 +56,33 @@ def rerank_nodes(query, nodes, reranker, top_k=10):
|
|
| 56 |
try:
|
| 57 |
log_message(f"Переранжирую {len(nodes)} узлов")
|
| 58 |
|
| 59 |
-
|
| 60 |
-
for node in nodes
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
return reranked_nodes
|
| 72 |
except Exception as e:
|
| 73 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 74 |
-
return nodes[:top_k]
|
| 75 |
-
|
| 76 |
-
|
|
|
|
| 56 |
try:
|
| 57 |
log_message(f"Переранжирую {len(nodes)} узлов")
|
| 58 |
|
| 59 |
+
# Separate tables and images from text nodes
|
| 60 |
+
table_nodes = [node for node in nodes if node.metadata.get('type') == 'table']
|
| 61 |
+
image_nodes = [node for node in nodes if node.metadata.get('type') == 'image']
|
| 62 |
+
text_nodes = [node for node in nodes if node.metadata.get('type', 'text') == 'text']
|
| 63 |
|
| 64 |
+
priority_nodes = table_nodes + image_nodes
|
| 65 |
|
| 66 |
+
# Rerank only text nodes
|
| 67 |
+
if text_nodes:
|
| 68 |
+
pairs = []
|
| 69 |
+
for node in text_nodes:
|
| 70 |
+
pairs.append([query, node.text])
|
| 71 |
+
|
| 72 |
+
scores = reranker.predict(pairs)
|
| 73 |
+
scored_nodes = list(zip(text_nodes, scores))
|
| 74 |
+
scored_nodes.sort(key=lambda x: x[1], reverse=True)
|
| 75 |
+
reranked_text_nodes = [node for node, score in scored_nodes]
|
| 76 |
+
else:
|
| 77 |
+
reranked_text_nodes = []
|
| 78 |
|
| 79 |
+
# Combine: priority nodes first, then reranked text nodes
|
| 80 |
+
final_nodes = priority_nodes + reranked_text_nodes
|
| 81 |
+
result = final_nodes[:top_k]
|
| 82 |
+
|
| 83 |
+
log_message(f"Возвращаю {len(priority_nodes)} приоритетных узлов и {len(result) - len(priority_nodes)} текстовых узлов")
|
| 84 |
+
return result
|
| 85 |
|
|
|
|
| 86 |
except Exception as e:
|
| 87 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 88 |
+
return nodes[:top_k]
|
|
|
|
|
|
utils.py
CHANGED
|
@@ -105,95 +105,6 @@ def format_context_for_llm(nodes):
|
|
| 105 |
|
| 106 |
return "\n".join(context_parts)
|
| 107 |
|
| 108 |
-
def generate_sources_html(nodes, chunks_df=None):
|
| 109 |
-
html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
|
| 110 |
-
html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
|
| 111 |
-
|
| 112 |
-
# Group nodes by document to avoid duplicates
|
| 113 |
-
sources_by_doc = {}
|
| 114 |
-
|
| 115 |
-
for i, node in enumerate(nodes):
|
| 116 |
-
metadata = node.metadata if hasattr(node, 'metadata') else {}
|
| 117 |
-
doc_type = metadata.get('type', 'text')
|
| 118 |
-
doc_id = metadata.get('document_id', 'unknown')
|
| 119 |
-
section_id = metadata.get('section_id', '')
|
| 120 |
-
section_text = metadata.get('section_text', '')
|
| 121 |
-
section_path = metadata.get('section_path', '')
|
| 122 |
-
|
| 123 |
-
if doc_type == 'table':
|
| 124 |
-
table_num = metadata.get('table_number', 'unknown')
|
| 125 |
-
key = f"{doc_id}_table_{table_num}"
|
| 126 |
-
elif doc_type == 'image':
|
| 127 |
-
image_num = metadata.get('image_number', 'unknown')
|
| 128 |
-
key = f"{doc_id}_image_{image_num}"
|
| 129 |
-
else:
|
| 130 |
-
section_key = section_path if section_path else section_id
|
| 131 |
-
key = f"{doc_id}_text_{section_key}"
|
| 132 |
-
|
| 133 |
-
if key not in sources_by_doc:
|
| 134 |
-
sources_by_doc[key] = {
|
| 135 |
-
'doc_id': doc_id,
|
| 136 |
-
'doc_type': doc_type,
|
| 137 |
-
'metadata': metadata,
|
| 138 |
-
'sections': set()
|
| 139 |
-
}
|
| 140 |
-
|
| 141 |
-
# Add section information
|
| 142 |
-
if section_path:
|
| 143 |
-
sources_by_doc[key]['sections'].add(f"{section_path}")
|
| 144 |
-
elif section_id and section_id != 'unknown':
|
| 145 |
-
sources_by_doc[key]['sections'].add(f"{section_id}")
|
| 146 |
-
|
| 147 |
-
# Generate HTML for each unique source
|
| 148 |
-
for source_info in sources_by_doc.values():
|
| 149 |
-
metadata = source_info['metadata']
|
| 150 |
-
doc_type = source_info['doc_type']
|
| 151 |
-
doc_id = source_info['doc_id']
|
| 152 |
-
|
| 153 |
-
html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
|
| 154 |
-
|
| 155 |
-
if doc_type == 'text':
|
| 156 |
-
html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
|
| 157 |
-
|
| 158 |
-
elif doc_type == 'table' or doc_type == 'table_row':
|
| 159 |
-
table_num = metadata.get('table_number', 'unknown')
|
| 160 |
-
table_title = metadata.get('table_title', '')
|
| 161 |
-
if table_num and table_num != 'unknown':
|
| 162 |
-
if not str(table_num).startswith('№'):
|
| 163 |
-
table_num = f"№{table_num}"
|
| 164 |
-
html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
|
| 165 |
-
if table_title and table_title != 'unknown':
|
| 166 |
-
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
|
| 167 |
-
else:
|
| 168 |
-
html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
|
| 169 |
-
|
| 170 |
-
elif doc_type == 'image':
|
| 171 |
-
image_num = metadata.get('image_number', 'unknown')
|
| 172 |
-
image_title = metadata.get('image_title', '')
|
| 173 |
-
section = metadata.get('section', '')
|
| 174 |
-
if image_num and image_num != 'unknown':
|
| 175 |
-
if not str(image_num).startswith('№'):
|
| 176 |
-
image_num = f"№{image_num}"
|
| 177 |
-
html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
|
| 178 |
-
if image_title and image_title != 'unknown':
|
| 179 |
-
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
|
| 180 |
-
if section and section != 'unknown':
|
| 181 |
-
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
|
| 182 |
-
else:
|
| 183 |
-
html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
|
| 184 |
-
|
| 185 |
-
# Add file link if available
|
| 186 |
-
if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
|
| 187 |
-
doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
|
| 188 |
-
if not doc_rows.empty:
|
| 189 |
-
file_link = doc_rows.iloc[0]['file_link']
|
| 190 |
-
html += f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>"
|
| 191 |
-
|
| 192 |
-
html += "</div>"
|
| 193 |
-
|
| 194 |
-
html += "</div>"
|
| 195 |
-
return html
|
| 196 |
-
|
| 197 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 198 |
if query_engine is None:
|
| 199 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
|
|
@@ -372,7 +283,6 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 372 |
html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
|
| 373 |
html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
|
| 374 |
|
| 375 |
-
# Group nodes by document to avoid duplicates
|
| 376 |
sources_by_doc = {}
|
| 377 |
|
| 378 |
for i, node in enumerate(nodes):
|
|
|
|
| 105 |
|
| 106 |
return "\n".join(context_parts)
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 109 |
if query_engine is None:
|
| 110 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
|
|
|
|
| 283 |
html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
|
| 284 |
html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
|
| 285 |
|
|
|
|
| 286 |
sources_by_doc = {}
|
| 287 |
|
| 288 |
for i, node in enumerate(nodes):
|