Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Sep 30, 2025

Commit

6c83262

1 Parent(s): 944b5ee

new ways

Browse files

Files changed (2) hide show

index_retriever.py +3 -3
utils.py +51 -73

index_retriever.py CHANGED Viewed

@@ -16,18 +16,18 @@ def create_query_engine(vector_index):
     try:
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=20
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
-            similarity_top_k=30,
             similarity_cutoff=0.7
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
-            similarity_top_k=40,
             num_queries=1
         )

     try:
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
+            similarity_top_k=15
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
+            similarity_top_k=20,
             similarity_cutoff=0.7
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
+            similarity_top_k=30,
             num_queries=1
         )

utils.py CHANGED Viewed

@@ -43,69 +43,6 @@ def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingua
 def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
     return CrossEncoder(model_name)
-def format_context_for_llm(nodes):
-    context_parts = []
-    for node in nodes:
-        metadata = node.metadata if hasattr(node, 'metadata') else {}
-        doc_id = metadata.get('document_id', 'Неизвестный документ')
-        section_info = ""
-        if metadata.get('section_path'):
-            section_path = metadata['section_path']
-            section_text = metadata.get('section_text', '')
-            parent_section = metadata.get('parent_section', '')
-            parent_title = metadata.get('parent_title', '')
-            level = metadata.get('level', '')
-            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
-                # For subsections, show: пункт X.X в разделе X (Title)
-                section_info = f"пункт {section_path} в разделе {parent_section} ({parent_title})"
-            elif section_text:
-                # For main sections, show: пункт X (Title)
-                section_info = f"пункт {section_path} ({section_text})"
-            else:
-                section_info = f"пункт {section_path}"
-        elif metadata.get('section_id'):
-            section_id = metadata['section_id']
-            section_text = metadata.get('section_text', '')
-            level = metadata.get('level', '')
-            parent_section = metadata.get('parent_section', '')
-            parent_title = metadata.get('parent_title', '')
-            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
-                # For subsections without section_path, show: пункт X.X в разделе X (Title)
-                section_info = f"пункт {section_id} в разделе {parent_section} ({parent_title})"
-            elif section_text:
-                section_info = f"пункт {section_id} ({section_text})"
-            else:
-                section_info = f"пункт {section_id}"
-        if metadata.get('type') == 'table' and metadata.get('table_number'):
-            table_num = metadata['table_number']
-            if not str(table_num).startswith('№'):
-                table_num = f"№{table_num}"
-            section_info = f"таблица {table_num}"
-        if metadata.get('type') == 'image' and metadata.get('image_number'):
-            image_num = metadata['image_number']
-            if not str(image_num).startswith('№'):
-                image_num = f"№{image_num}"
-            section_info = f"рисунок {image_num}"
-        context_text = node.text if hasattr(node, 'text') else str(node)
-        if section_info:
-            formatted_context = f"[ИСТОЧНИК: {section_info} документа {doc_id}]\n{context_text}\n"
-        else:
-            formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
-        context_parts.append(formatted_context)
-    return "\n".join(context_parts)
 def get_llm_model(model_name):
     try:
         model_config = AVAILABLE_MODELS.get(model_name)
@@ -148,42 +85,82 @@ def format_context_for_llm(nodes):
         section_info = ""
         if metadata.get('section_path'):
             section_path = metadata['section_path']
             section_text = metadata.get('section_text', '')
             parent_section = metadata.get('parent_section', '')
             parent_title = metadata.get('parent_title', '')
-            if metadata.get('level') in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
-                section_info = f"пункт {section_path} ({section_text}) в разделе {parent_section} ({parent_title})"
             elif section_text:
-                section_info = f"пункт {section_path} ({section_text})"
             else:
-                section_info = f"пункт {section_path}"
         elif metadata.get('section_id'):
             section_id = metadata['section_id']
             section_text = metadata.get('section_text', '')
-            if section_text:
-                section_info = f"пункт {section_id} ({section_text})"
             else:
-                section_info = f"пункт {section_id}"
         if metadata.get('type') == 'table' and metadata.get('table_number'):
             table_num = metadata['table_number']
             if not str(table_num).startswith('№'):
                 table_num = f"№{table_num}"
-            section_info = f"таблица {table_num}"
         if metadata.get('type') == 'image' and metadata.get('image_number'):
             image_num = metadata['image_number']
             if not str(image_num).startswith('№'):
                 image_num = f"№{image_num}"
-            section_info = f"рисунок {image_num}"
         context_text = node.text if hasattr(node, 'text') else str(node)
         if section_info:
-            formatted_context = f"[ИСТОЧНИК: {section_info} документа {doc_id}]\n{context_text}\n"
         else:
             formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
@@ -191,6 +168,7 @@ def format_context_for_llm(nodes):
     return "\n".join(context_parts)
 def generate_sources_html(nodes, chunks_df=None):
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"

 def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
     return CrossEncoder(model_name)
 def get_llm_model(model_name):
     try:
         model_config = AVAILABLE_MODELS.get(model_name)
         section_info = ""
+        # Handle section information with proper hierarchy
         if metadata.get('section_path'):
             section_path = metadata['section_path']
             section_text = metadata.get('section_text', '')
             parent_section = metadata.get('parent_section', '')
             parent_title = metadata.get('parent_title', '')
+            level = metadata.get('level', '')
+            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
+                # For subsections: раздел X (Title), пункт X.X
+                if section_text:
+                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
+                else:
+                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
             elif section_text:
+                # For main sections: раздел X (Title)
+                section_info = f"раздел {section_path} ({section_text})"
             else:
+                section_info = f"раздел {section_path}"
         elif metadata.get('section_id'):
             section_id = metadata['section_id']
             section_text = metadata.get('section_text', '')
+            level = metadata.get('level', '')
+            parent_section = metadata.get('parent_section', '')
+            parent_title = metadata.get('parent_title', '')
+            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
+                if section_text:
+                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
+                else:
+                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
+            elif section_text:
+                section_info = f"раздел {section_id} ({section_text})"
             else:
+                section_info = f"раздел {section_id}"
+        # Override with table/image info if applicable
         if metadata.get('type') == 'table' and metadata.get('table_number'):
             table_num = metadata['table_number']
             if not str(table_num).startswith('№'):
                 table_num = f"№{table_num}"
+            table_title = metadata.get('table_title', '')
+            # Include section context for tables
+            base_section = ""
+            if metadata.get('section_path'):
+                base_section = f", раздел {metadata['section_path']}"
+            elif metadata.get('section_id'):
+                base_section = f", раздел {metadata['section_id']}"
+            if table_title:
+                section_info = f"Таблица {table_num} ({table_title}){base_section}"
+            else:
+                section_info = f"Таблица {table_num}{base_section}"
         if metadata.get('type') == 'image' and metadata.get('image_number'):
             image_num = metadata['image_number']
             if not str(image_num).startswith('№'):
                 image_num = f"№{image_num}"
+            image_title = metadata.get('image_title', '')
+            # Include section context for images
+            base_section = ""
+            if metadata.get('section_path'):
+                base_section = f", раздел {metadata['section_path']}"
+            elif metadata.get('section_id'):
+                base_section = f", раздел {metadata['section_id']}"
+            if image_title:
+                section_info = f"Рисунок {image_num} ({image_title}){base_section}"
+            else:
+                section_info = f"Рисунок {image_num}{base_section}"
         context_text = node.text if hasattr(node, 'text') else str(node)
         if section_info:
+            formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
         else:
             formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
     return "\n".join(context_parts)
 def generate_sources_html(nodes, chunks_df=None):
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"