Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Sep 18, 2025

Commit

e393fbc

1 Parent(s): 19e03d0

New chunk display + improved context passed to the LLM

Browse files

Files changed (6) hide show

__pycache__/config.cpython-311.pyc +0 -0
__pycache__/index_retriever.cpython-311.pyc +0 -0
__pycache__/my_logging.cpython-311.pyc +0 -0
app.py +44 -1
index_retriever.py +1 -1
utils.py +24 -10

__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (66.4 kB). View file

__pycache__/index_retriever.cpython-311.pyc ADDED Viewed

Binary file (4.25 kB). View file

__pycache__/my_logging.cpython-311.pyc ADDED Viewed

Binary file (811 Bytes). View file

app.py CHANGED Viewed

@@ -20,10 +20,14 @@ def create_chunks_display_html(chunk_info):
     for i, chunk in enumerate(chunk_info):
         bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
         html += f"""
         <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
             <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
-            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{chunk.get('section_id', 'unknown')}</span><br>
             <strong style='color: black;'>Содержание:</strong><br>
             <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                 {chunk['chunk_text']}
@@ -34,6 +38,45 @@ def create_chunks_display_html(chunk_info):
     html += "</div>"
     return html
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):

     for i, chunk in enumerate(chunk_info):
         bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
+        # Get section display info similar to format_context_for_llm
+        section_display = format_section_for_display(chunk)
         html += f"""
         <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
             <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
+            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
             <strong style='color: black;'>Содержание:</strong><br>
             <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                 {chunk['chunk_text']}
     html += "</div>"
     return html
def format_section_for_display(chunk):
    """Return the human-readable section label for one retrieved chunk.

    Args:
        chunk: Mapping describing a chunk. The keys read here are ``type``,
            ``table_number``, ``image_number``, ``section_path``,
            ``section_id``, ``section_text``, ``level``, ``parent_section``
            and ``parent_title``; every one of them is optional.

    Returns:
        A string label, chosen in this order:

        * ``"таблица №N"`` for a ``table`` chunk that has a table number;
        * ``"рисунок №N"`` for an ``image`` chunk that has an image number;
        * ``"пункт X в разделе P (Title)"`` for a subsection-level chunk
          whose parent section and parent title are both known;
        * ``"пункт X (Text)"`` when the section text is known;
        * ``"пункт X"`` when only the section reference is known;
        * ``"unknown"`` when no section reference is available at all.

        ``X`` is ``section_path`` when present, otherwise ``section_id``.
    """
    subsection_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')
    doc_type = chunk.get('type', 'text')

    # Tables and images are identified by their own number, not by section.
    for kind, number_key, noun in (
        ('table', 'table_number', 'таблица'),
        ('image', 'image_number', 'рисунок'),
    ):
        number = chunk.get(number_key)
        if doc_type == kind and number:
            # Avoid doubling the sign when the stored number already has it.
            if not str(number).startswith('№'):
                number = f"№{number}"
            return f"{noun} {number}"

    # Prefer the full section path; fall back to the plain section id.
    section_ref = chunk.get('section_path', '')
    if not section_ref:
        section_id = chunk.get('section_id', 'unknown')
        if section_id and section_id != 'unknown':
            section_ref = section_id

    if not section_ref:
        # A missing, empty or None section id must still render as a
        # readable placeholder rather than "None" or a blank label.
        return 'unknown'

    parent_section = chunk.get('parent_section', '')
    parent_title = chunk.get('parent_title', '')
    if chunk.get('level', '') in subsection_levels and parent_section and parent_title:
        return f"пункт {section_ref} в разделе {parent_section} ({parent_title})"

    section_text = chunk.get('section_text', '')
    if section_text:
        return f"пункт {section_ref} ({section_text})"
    return f"пункт {section_ref}"
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):

index_retriever.py CHANGED Viewed

@@ -22,7 +22,7 @@ def create_query_engine(vector_index):
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
             similarity_top_k=30,
-            similarity_cutoff=0.7
         )
         hybrid_retriever = QueryFusionRetriever(

         vector_retriever = VectorIndexRetriever(
             index=vector_index,
             similarity_top_k=30,
+            similarity_cutoff=0.8
         )
         hybrid_retriever = QueryFusionRetriever(

utils.py CHANGED Viewed

@@ -57,17 +57,27 @@ def format_context_for_llm(nodes):
             section_text = metadata.get('section_text', '')
             parent_section = metadata.get('parent_section', '')
             parent_title = metadata.get('parent_title', '')
-            if metadata.get('level') in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
-                section_info = f"пункт {section_path} ({section_text}) в разделе {parent_section} ({parent_title})"
             elif section_text:
                 section_info = f"пункт {section_path} ({section_text})"
             else:
                 section_info = f"пункт {section_path}"
         elif metadata.get('section_id'):
             section_id = metadata['section_id']
             section_text = metadata.get('section_text', '')
-            if section_text:
                 section_info = f"пункт {section_id} ({section_text})"
             else:
                 section_info = f"пункт {section_id}"
@@ -249,10 +259,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
         chunk_info = []
         for node in reranked_nodes:
-            section_id = node.metadata.get('section_id', node.metadata.get('section', 'unknown'))
             chunk_info.append({
-                'document_id': node.metadata.get('document_id', 'unknown'),
-                'section_id': section_id,
                 'chunk_size': len(node.text),
                 'chunk_text': node.text
             })
@@ -413,10 +431,6 @@ def generate_sources_html(nodes, chunks_df=None):
         if doc_type == 'text':
             html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
-            # Show all sections for this document
-            if source_info['sections']:
-                sections_text = ", ".join(sorted(source_info['sections']))
-                html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{sections_text}</p>"
         elif doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')

             section_text = metadata.get('section_text', '')
             parent_section = metadata.get('parent_section', '')
             parent_title = metadata.get('parent_title', '')
+            level = metadata.get('level', '')
+            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
+                # For subsections, show: пункт X.X в разделе X (Title)
+                section_info = f"пункт {section_path} в разделе {parent_section} ({parent_title})"
             elif section_text:
+                # For main sections, show: пункт X (Title)
                 section_info = f"пункт {section_path} ({section_text})"
             else:
                 section_info = f"пункт {section_path}"
         elif metadata.get('section_id'):
             section_id = metadata['section_id']
             section_text = metadata.get('section_text', '')
+            level = metadata.get('level', '')
+            parent_section = metadata.get('parent_section', '')
+            parent_title = metadata.get('parent_title', '')
+            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
+                # For subsections without section_path, show: пункт X.X в разделе X (Title)
+                section_info = f"пункт {section_id} в разделе {parent_section} ({parent_title})"
+            elif section_text:
                 section_info = f"пункт {section_id} ({section_text})"
             else:
                 section_info = f"пункт {section_id}"
         chunk_info = []
         for node in reranked_nodes:
+            metadata = node.metadata if hasattr(node, 'metadata') else {}
             chunk_info.append({
+                'document_id': metadata.get('document_id', 'unknown'),
+                'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
+                'section_path': metadata.get('section_path', ''),
+                'section_text': metadata.get('section_text', ''),
+                'level': metadata.get('level', ''),
+                'parent_section': metadata.get('parent_section', ''),
+                'parent_title': metadata.get('parent_title', ''),
+                'type': metadata.get('type', 'text'),
+                'table_number': metadata.get('table_number', ''),
+                'image_number': metadata.get('image_number', ''),
                 'chunk_size': len(node.text),
                 'chunk_text': node.text
             })
         if doc_type == 'text':
             html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
         elif doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')