Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 6, 2025

Commit

63ebb90

1 Parent(s): 38ed4e9

new documents prep

Browse files

Files changed (3) hide show

app.py +33 -4
documents_prep.py +2 -20
utils.py +10 -20

app.py CHANGED Viewed

@@ -11,17 +11,46 @@ from config import (
     JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 )
 def create_chunks_display_html(chunk_info):
     if not chunk_info:
         return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
     html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
-    html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>"
-    for i, chunk in enumerate(chunk_info):
         bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
-        # Get section display info
         section_display = get_section_display(chunk)
         formatted_content = get_formatted_content(chunk)

     JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 )
+def merge_table_chunks(chunk_info):
+    merged = {}
+    for chunk in chunk_info:
+        doc_type = chunk.get('type', 'text')
+        doc_id = chunk.get('document_id', 'unknown')
+        if doc_type == 'table' or doc_type == 'table_row':
+            table_num = chunk.get('table_number', '')
+            key = f"{doc_id}_{table_num}"
+            if key not in merged:
+                merged[key] = {
+                    'document_id': doc_id,
+                    'type': 'table',
+                    'table_number': table_num,
+                    'section_id': chunk.get('section_id', 'unknown'),
+                    'chunk_text': chunk.get('chunk_text', '')
+                }
+            else:
+                merged[key]['chunk_text'] += '\n' + chunk.get('chunk_text', '')
+        else:
+            unique_key = f"{doc_id}_{chunk.get('section_id', '')}_{chunk.get('chunk_id', 0)}"
+            merged[unique_key] = chunk
+    return list(merged.values())
 def create_chunks_display_html(chunk_info):
     if not chunk_info:
         return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
+    merged_chunks = merge_table_chunks(chunk_info)
     html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
+    html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>"
+    for i, chunk in enumerate(merged_chunks):
         bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
         section_display = get_section_display(chunk)
         formatted_content = get_formatted_content(chunk)

documents_prep.py CHANGED Viewed

@@ -162,30 +162,12 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
-    """Format consistent table header"""
-    content = f"ДОКУМЕНТ: {doc_id}\n"
-    content += f"ТАБЛИЦА: {table_identifier}\n"
-    content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
-    content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
     if table_title:
         content += f"НАЗВАНИЕ: {table_title}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"
-    content += f"{'='*70}\n\n"
-    # Enhanced search keywords
-    content += f"Это таблица {table_identifier} из документа {doc_id}. "
-    content += f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. "
-    if section:
-        content += f"Раздел: {section}. "
-        if 'приложени' in section.lower():
-            content += f"Таблица из приложения. "
-    if table_title:
-        content += f"Название: {table_title}. "
-    content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n"
     if headers:
         header_str = ' | '.join(str(h) for h in headers)

 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
+    content = f"ТАБЛИЦА {table_identifier} из {doc_id}\n"
     if table_title:
         content += f"НАЗВАНИЕ: {table_title}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"
+    content += f"{'='*70}\n"
     if headers:
         header_str = ' | '.join(str(h) for h in headers)

utils.py CHANGED Viewed

@@ -53,19 +53,16 @@ def generate_sources_html(nodes, chunks_df=None):
         metadata = node.metadata if hasattr(node, 'metadata') else {}
         doc_type = metadata.get('type', 'text')
         doc_id = metadata.get('document_id', 'unknown')
-        section_id = metadata.get('section_id', '')
-        section_text = metadata.get('section_text', '')
-        section_path = metadata.get('section_path', '')
-        # Create a unique key for grouping
-        if doc_type == 'table':
             table_num = metadata.get('table_number', 'unknown')
             key = f"{doc_id}_table_{table_num}"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             key = f"{doc_id}_image_{image_num}"
         else:
-            # For text documents, group by section path or section id
             section_key = section_path if section_path else section_id
             key = f"{doc_id}_text_{section_key}"
@@ -77,13 +74,14 @@ def generate_sources_html(nodes, chunks_df=None):
                 'sections': set()
             }
-        # Add section information
-        if section_path:
-            sources_by_doc[key]['sections'].add(f"пункт {section_path}")
-        elif section_id and section_id != 'unknown':
-            sources_by_doc[key]['sections'].add(f"пункт {section_id}")
-    # Generate HTML for each unique source
     for source_info in sources_by_doc.values():
         metadata = source_info['metadata']
         doc_type = source_info['doc_type']
@@ -93,7 +91,6 @@ def generate_sources_html(nodes, chunks_df=None):
         if doc_type == 'text':
             html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
         elif doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')
             table_title = metadata.get('table_title', '')
@@ -105,23 +102,16 @@ def generate_sources_html(nodes, chunks_df=None):
                     html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
             else:
                 html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             image_title = metadata.get('image_title', '')
-            section = metadata.get('section', '')
             if image_num and image_num != 'unknown':
                 if not str(image_num).startswith('№'):
                     image_num = f"№{image_num}"
                 html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
                 if image_title and image_title != 'unknown':
                     html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
-                if section and section != 'unknown':
-                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
-            else:
-                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
-        # Add file link if available
         if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
             doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
             if not doc_rows.empty:

         metadata = node.metadata if hasattr(node, 'metadata') else {}
         doc_type = metadata.get('type', 'text')
         doc_id = metadata.get('document_id', 'unknown')
+        if doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')
             key = f"{doc_id}_table_{table_num}"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             key = f"{doc_id}_image_{image_num}"
         else:
+            section_path = metadata.get('section_path', '')
+            section_id = metadata.get('section_id', '')
             section_key = section_path if section_path else section_id
             key = f"{doc_id}_text_{section_key}"
                 'sections': set()
             }
+        if doc_type not in ['table', 'table_row', 'image']:
+            section_path = metadata.get('section_path', '')
+            section_id = metadata.get('section_id', '')
+            if section_path:
+                sources_by_doc[key]['sections'].add(f"пункт {section_path}")
+            elif section_id and section_id != 'unknown':
+                sources_by_doc[key]['sections'].add(f"пункт {section_id}")
     for source_info in sources_by_doc.values():
         metadata = source_info['metadata']
         doc_type = source_info['doc_type']
         if doc_type == 'text':
             html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
         elif doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')
             table_title = metadata.get('table_title', '')
                     html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
             else:
                 html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             image_title = metadata.get('image_title', '')
             if image_num and image_num != 'unknown':
                 if not str(image_num).startswith('№'):
                     image_num = f"№{image_num}"
                 html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
                 if image_title and image_title != 'unknown':
                     html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
         if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
             doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
             if not doc_rows.empty: