Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Sep 18, 2025

Commit

499b5c3

1 Parent(s): 59c7b5b

priority table + images

Browse files

Files changed (2) hide show

index_retriever.py +24 -12
utils.py +0 -90

index_retriever.py CHANGED Viewed

@@ -56,21 +56,33 @@ def rerank_nodes(query, nodes, reranker, top_k=10):
     try:
         log_message(f"Переранжирую {len(nodes)} узлов")
-        pairs = []
-        for node in nodes:
-            pairs.append([query, node.text])
-        scores = reranker.predict(pairs)
-        scored_nodes = list(zip(nodes, scores))
-        scored_nodes.sort(key=lambda x: x[1], reverse=True)
-        reranked_nodes = [node for node, score in scored_nodes[:top_k]]
-        log_message(f"Возвращаю топ-{len(reranked_nodes)} переранжированных узлов")
-        return reranked_nodes
     except Exception as e:
         log_message(f"Ошибка переранжировки: {str(e)}")
-        return nodes[:top_k]

     try:
         log_message(f"Переранжирую {len(nodes)} узлов")
+        # Separate tables and images from text nodes
+        table_nodes = [node for node in nodes if node.metadata.get('type') == 'table']
+        image_nodes = [node for node in nodes if node.metadata.get('type') == 'image']
+        text_nodes = [node for node in nodes if node.metadata.get('type', 'text') == 'text']
+        priority_nodes = table_nodes + image_nodes
+        # Rerank only text nodes
+        if text_nodes:
+            pairs = []
+            for node in text_nodes:
+                pairs.append([query, node.text])
+            scores = reranker.predict(pairs)
+            scored_nodes = list(zip(text_nodes, scores))
+            scored_nodes.sort(key=lambda x: x[1], reverse=True)
+            reranked_text_nodes = [node for node, score in scored_nodes]
+        else:
+            reranked_text_nodes = []
+        # Combine: priority nodes first, then reranked text nodes
+        final_nodes = priority_nodes + reranked_text_nodes
+        result = final_nodes[:top_k]
+        log_message(f"Возвращаю {len(priority_nodes)} приоритетных узлов и {len(result) - len(priority_nodes)} текстовых узлов")
+        return result
     except Exception as e:
         log_message(f"Ошибка переранжировки: {str(e)}")
+        return nodes[:top_k]

utils.py CHANGED Viewed

@@ -105,95 +105,6 @@ def format_context_for_llm(nodes):
     return "\n".join(context_parts)
-def generate_sources_html(nodes, chunks_df=None):
-    html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
-    html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
-    # Group nodes by document to avoid duplicates
-    sources_by_doc = {}
-    for i, node in enumerate(nodes):
-        metadata = node.metadata if hasattr(node, 'metadata') else {}
-        doc_type = metadata.get('type', 'text')
-        doc_id = metadata.get('document_id', 'unknown')
-        section_id = metadata.get('section_id', '')
-        section_text = metadata.get('section_text', '')
-        section_path = metadata.get('section_path', '')
-        if doc_type == 'table':
-            table_num = metadata.get('table_number', 'unknown')
-            key = f"{doc_id}_table_{table_num}"
-        elif doc_type == 'image':
-            image_num = metadata.get('image_number', 'unknown')
-            key = f"{doc_id}_image_{image_num}"
-        else:
-            section_key = section_path if section_path else section_id
-            key = f"{doc_id}_text_{section_key}"
-        if key not in sources_by_doc:
-            sources_by_doc[key] = {
-                'doc_id': doc_id,
-                'doc_type': doc_type,
-                'metadata': metadata,
-                'sections': set()
-            }
-        # Add section information
-        if section_path:
-            sources_by_doc[key]['sections'].add(f"{section_path}")
-        elif section_id and section_id != 'unknown':
-            sources_by_doc[key]['sections'].add(f"{section_id}")
-    # Generate HTML for each unique source
-    for source_info in sources_by_doc.values():
-        metadata = source_info['metadata']
-        doc_type = source_info['doc_type']
-        doc_id = source_info['doc_id']
-        html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
-        if doc_type == 'text':
-            html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
-        elif doc_type == 'table' or doc_type == 'table_row':
-            table_num = metadata.get('table_number', 'unknown')
-            table_title = metadata.get('table_title', '')
-            if table_num and table_num != 'unknown':
-                if not str(table_num).startswith('№'):
-                    table_num = f"№{table_num}"
-                html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
-                if table_title and table_title != 'unknown':
-                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
-            else:
-                html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
-        elif doc_type == 'image':
-            image_num = metadata.get('image_number', 'unknown')
-            image_title = metadata.get('image_title', '')
-            section = metadata.get('section', '')
-            if image_num and image_num != 'unknown':
-                if not str(image_num).startswith('№'):
-                    image_num = f"№{image_num}"
-                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
-                if image_title and image_title != 'unknown':
-                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
-                if section and section != 'unknown':
-                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
-            else:
-                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
-        # Add file link if available
-        if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
-            doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
-            if not doc_rows.empty:
-                file_link = doc_rows.iloc[0]['file_link']
-                html += f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>"
-        html += "</div>"
-    html += "</div>"
-    return html
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
@@ -372,7 +283,6 @@ def generate_sources_html(nodes, chunks_df=None):
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
-    # Group nodes by document to avoid duplicates
     sources_by_doc = {}
     for i, node in enumerate(nodes):

     return "\n".join(context_parts)
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
     sources_by_doc = {}
     for i, node in enumerate(nodes):