Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Sep 15, 2025

Commit

d490230

1 Parent(s): a5d5837

Added a new window for chunking results and a hybrid approach for chunking (max limit is 2048).

Browse files

Files changed (2) hide show

app.py +36 -7
documents_prep.py +66 -3

app.py CHANGED Viewed

@@ -11,6 +11,29 @@ from config import (
     JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 )
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
@@ -83,7 +106,7 @@ def switch_model(model_name, vector_index):
         log_message(error_msg)
         return None, f"❌ {error_msg}"
-def create_demo_interface(answer_question_func, switch_model_func, current_model):
     with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
@@ -92,7 +115,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
         ## Инструмент для работы с нормативной документацией
         """)
-        with gr.Tab("🏠 Поиск по нормативным документам"):
             gr.Markdown("### Задайте вопрос по нормативной документации")
             with gr.Row():
@@ -100,11 +123,11 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     model_dropdown = gr.Dropdown(
                         choices=list(AVAILABLE_MODELS.keys()),
                         value=current_model,
-                        label="🤖 Выберите языковую модель",
                         info="Выберите модель для генерации ответов"
                     )
                 with gr.Column(scale=1):
-                    switch_btn = gr.Button("🔄 Переключить модель", variant="secondary")
                     model_status = gr.Textbox(
                         value=f"Текущая модель: {current_model}",
                         label="Статус модели",
@@ -118,15 +141,13 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                         placeholder="Введите вопрос по нормативным документам...",
                         lines=3
                     )
-                    ask_btn = gr.Button("🔍 Найти ответ", variant="primary", size="lg")
                     gr.Examples(
                         examples=[
                             "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
                             "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
                             "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
-                            "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
-                            "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
                         ],
                         inputs=question_input
                     )
@@ -161,6 +182,14 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                 inputs=[question_input],
                 outputs=[answer_output, sources_output]
             )
     return demo

     JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 )
+def create_chunks_display_html(chunk_info):
+    """Build the HTML fragment that lists the processed document chunks.
+
+    Args:
+        chunk_info: Sequence of dicts, each providing the keys
+            ``document_id``, ``section_id``, ``chunk_id``, ``chunk_size``
+            and ``chunk_preview``. ``None`` or an empty sequence is allowed.
+
+    Returns:
+        An HTML string: a placeholder ``<div>`` when there is nothing to
+        show, otherwise a scrollable container holding a chunk counter and
+        one card per chunk.
+
+    Raises:
+        KeyError: If a chunk dict lacks one of the keys listed above.
+    """
+    # Imported locally so the block does not depend on a file-level import.
+    from html import escape
+
+    if not chunk_info:
+        return "<div style='padding: 20px; text-align: center;'>Нет данных о чанках</div>"
+
+    # Collect fragments and join once instead of growing a string with "+=".
+    fragments = [
+        "<div style='max-height: 500px; overflow-y: auto; padding: 10px;'>",
+        f"<h4>Всего чанков: {len(chunk_info)}</h4>",
+    ]
+    for i, chunk in enumerate(chunk_info):
+        # Alternate the card background so adjacent chunks are easy to tell apart.
+        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
+        # The values originate from document text and metadata; escape them so
+        # characters such as "<" or "&" are displayed instead of being parsed
+        # as markup by the browser.
+        document_id = escape(str(chunk['document_id']))
+        section_id = escape(str(chunk['section_id']))
+        chunk_id = escape(str(chunk['chunk_id']))
+        chunk_size = escape(str(chunk['chunk_size']))
+        chunk_preview = escape(str(chunk['chunk_preview']))
+        fragments.append(f"""
+        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff;'>
+            <strong>Документ:</strong> {document_id}<br>
+            <strong>Раздел:</strong> {section_id}<br>
+            <strong>Чанк:</strong> {chunk_id} | <strong>Размер:</strong> {chunk_size} символов<br>
+            <strong>Содержание:</strong><br>
+            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px;'>
+                {chunk_preview}
+            </div>
+        </div>
+        """)
+    fragments.append("</div>")
+    return "".join(fragments)
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
         log_message(error_msg)
         return None, f"❌ {error_msg}"
+def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
     with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         ## Инструмент для работы с нормативной документацией
         """)
+        with gr.Tab("Поиск по нормативным документам"):
             gr.Markdown("### Задайте вопрос по нормативной документации")
             with gr.Row():
                     model_dropdown = gr.Dropdown(
                         choices=list(AVAILABLE_MODELS.keys()),
                         value=current_model,
+                        label="Выберите языковую модель",
                         info="Выберите модель для генерации ответов"
                     )
                 with gr.Column(scale=1):
+                    switch_btn = gr.Button("Переключить модель", variant="secondary")
                     model_status = gr.Textbox(
                         value=f"Текущая модель: {current_model}",
                         label="Статус модели",
                         placeholder="Введите вопрос по нормативным документам...",
                         lines=3
                     )
+                    ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
                     gr.Examples(
                         examples=[
                             "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
                             "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
                             "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
                         ],
                         inputs=question_input
                     )
                 inputs=[question_input],
                 outputs=[answer_output, sources_output]
             )
+        with gr.Tab("Просмотр чанков"):
+            gr.Markdown("### Содержание обработанных чанков документов")
+            chunks_display = gr.HTML(
+                value=create_chunks_display_html(chunk_info),
+                label="Информация о чанках"
+            )
     return demo

documents_prep.py CHANGED Viewed

@@ -4,8 +4,67 @@ import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
 def extract_text_from_json(data, document_id, document_name):
     documents = []
@@ -162,12 +221,16 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
                 log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                 continue
-        log_message(f"Всего создано {len(all_documents)} текстовых документов")
-        return all_documents
     except Exception as e:
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
-        return []
 def extract_section_title(section_text):

 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
+from llama_index.core.text_splitter import SentenceSplitter
+from config import CHUNK_SIZE, CHUNK_OVERLAP
+def chunk_document(doc, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
+    """Split one document into a list of smaller ``Document`` chunks.
+
+    Args:
+        doc: Source document; its ``text`` is split and its ``metadata`` is
+            copied onto every chunk.
+        chunk_size: Size limit handed to ``SentenceSplitter``.
+        chunk_overlap: Overlap handed to ``SentenceSplitter``.
+
+    Returns:
+        A list of ``Document`` objects, one per piece returned by the
+        splitter. Each carries the source metadata plus ``chunk_id``,
+        ``total_chunks``, ``chunk_size`` and ``original_doc_id``.
+    """
+    splitter = SentenceSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator=" "
+    )
+    pieces = splitter.split_text(doc.text)
+    piece_count = len(pieces)
+    # Not every document-like object is guaranteed to expose ``id_``.
+    source_id = getattr(doc, 'id_', None)
+
+    return [
+        Document(
+            text=piece,
+            metadata={
+                **doc.metadata,
+                "chunk_id": position,
+                "total_chunks": piece_count,
+                # NOTE(review): this is a character count of the piece; the
+                # splitter's own chunk_size limit may use a different unit
+                # (presumably tokens) - confirm before comparing the two.
+                "chunk_size": len(piece),
+                "original_doc_id": source_id,
+            },
+        )
+        for position, piece in enumerate(pieces)
+    ]
+def process_documents_with_chunking(documents):
+    """Chunk oversized documents and describe every resulting piece.
+
+    A document whose text is longer than ``CHUNK_SIZE`` characters is split
+    with ``chunk_document``; shorter documents are passed through unchanged
+    and reported as a single chunk with id 0.
+
+    Args:
+        documents: Iterable of document objects exposing ``text`` and
+            ``metadata``.
+
+    Returns:
+        A tuple ``(docs, chunk_info)``: the flat list of output documents and
+        a parallel list of summary dicts with the keys ``document_id``,
+        ``section_id``, ``chunk_id``, ``chunk_size`` and ``chunk_preview``.
+    """
+    def summarize(item, position):
+        # Previews are capped at 200 characters; longer texts get an ellipsis.
+        body = item.text
+        if len(body) > 200:
+            preview = body[:200] + "..."
+        else:
+            preview = body
+        return {
+            'document_id': item.metadata.get('document_id', 'unknown'),
+            'section_id': item.metadata.get('section_id', 'unknown'),
+            'chunk_id': position,
+            'chunk_size': len(body),
+            'chunk_preview': preview,
+        }
+
+    output_docs = []
+    summaries = []
+    for source in documents:
+        if len(source.text) <= CHUNK_SIZE:
+            # Short enough already: keep the document as its own single chunk.
+            output_docs.append(source)
+            summaries.append(summarize(source, 0))
+            continue
+        pieces = chunk_document(source)
+        output_docs.extend(pieces)
+        summaries.extend(
+            summarize(piece, position) for position, piece in enumerate(pieces)
+        )
+    return output_docs, summaries
 def extract_text_from_json(data, document_id, document_name):
     documents = []
                 log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                 continue
+        chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
+        log_message(f"Всего создано {len(all_documents)} исходных документов")
+        log_message(f"После chunking получено {len(chunked_documents)} чанков")
+        return chunked_documents, chunk_info
     except Exception as e:
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
+        return [], []
 def extract_section_title(section_text):