Spaces · Commit a85d6bf
Parent(s): 65025a2
new version of rag

Changed files:
- app.py (+304 -496)
- documents_prep.py (+263 -493)
- table_prep.py (+57 -199)
app.py CHANGED
@@ -1,546 +1,354 @@
-import os
-import json
-import zipfile
-import logging
-from typing import List, Dict, Any
-import pandas as pd
-from huggingface_hub import hf_hub_download, list_repo_files
-from llama_index.core import Document, VectorStoreIndex, KeywordTableIndex, Settings
-from llama_index.core.retrievers import VectorIndexRetriever, QueryFusionRetriever
-from llama_index.retrievers.bm25 import BM25Retriever
-from llama_index.core.query_engine import RetrieverQueryEngine
-from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
-from llama_index.core.text_splitter import SentenceSplitter
-from sentence_transformers import SentenceTransformer
-from llama_index.llms.google_genai import GoogleGenAI
-from llama_index.llms.openai import OpenAI
 import gradio as gr
 import sys
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler('rag_system.log'),
-        logging.StreamHandler(sys.stdout)
-    ]
-)
-logger = logging.getLogger(__name__)
-
-GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
-OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
-HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
-HF_TOKEN = os.getenv('HF_TOKEN')
-
-AVAILABLE_MODELS = {
-    "Gemini 2.5 Flash": {
-        "provider": "google",
-        "model_name": "gemini-2.5-flash",
-        "api_key": GOOGLE_API_KEY
-    },
-    "Gemini 2.5 Pro": {
-        "provider": "google",
-        "model_name": "gemini-2.5-pro",
-        "api_key": GOOGLE_API_KEY
-    },
-    "GPT-4o": {
-        "provider": "openai",
-        "model_name": "gpt-4o",
-        "api_key": OPENAI_API_KEY
-    },
-    "GPT-4o Mini": {
-        "provider": "openai",
-        "model_name": "gpt-4o-mini",
-        "api_key": OPENAI_API_KEY
-    },
-    "GPT-5": {
-        "provider": "openai",
-        "model_name": "gpt-5",
-        "api_key": OPENAI_API_KEY
-    }
-}

[... removed configuration constants collapsed in the diff view; DOWNLOAD_DIR, JSON_FILES_DIR, CHUNK_SIZE, CHUNK_OVERLAP, TABLE_MAX_ROWS_PER_CHUNK and DEFAULT_MODEL are referenced below ...]

-os.makedirs(DOWNLOAD_DIR, exist_ok=True)
-
-stats = {
-    'total_documents': 0,
-    'total_text_chunks': 0,
-    'total_tables': 0,
-    'total_table_chunks': 0,
-    'total_images': 0,
-    'failed_files': 0,
-    'encoding_errors': []
-}
-def get_llm_model(model_name):
-    try:
-        logger.info(f"Initializing LLM model: {model_name}")
-        model_config = AVAILABLE_MODELS.get(model_name)
-        if not model_config:
-            logger.warning(f"Model {model_name} not found, using default: {DEFAULT_MODEL}")
-            model_config = AVAILABLE_MODELS[DEFAULT_MODEL]

[... provider-specific construction of the LLM client collapsed in the diff view ...]

         else:
[...]
-    except Exception as e:
-        logger.error(f"Error initializing model {model_name}: {e}")
-        logger.info("Falling back to default Gemini model")
-        return GoogleGenAI(
-            model="gemini-2.0-flash",
-            api_key=GOOGLE_API_KEY
-        )
-def get_embedding_model():
-    logger.info("Initializing embedding model: all-MiniLM-L6-v2")
-    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-    embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    logger.info("Embedding model initialized successfully")
-    return embed_model
-
-def list_zip_files_in_repo(repo_id: str) -> List[str]:
-    logger.info(f"Listing files in repository: {repo_id}")
-    files = list_repo_files(repo_id, repo_type="dataset", token=HF_TOKEN)
-    zip_files = [f for f in files if f.startswith(JSON_FILES_DIR) and f.endswith('.zip')]
-    logger.info(f"Found {len(zip_files)} zip files in {JSON_FILES_DIR} directory")
-    return zip_files
-def […]   # download helper; its name and body are collapsed in the diff view
[...]
-    logger.info(f"File downloaded to: {local_path}")
-    return local_path
-def read_jsons_from_zip(zip_path: str) -> List[Dict[str, Any]]:
-    logger.info(f"Reading JSON files from zip: {zip_path}")
-    docs = []
-    json_count = 0
-    failed_count = 0
-
-    with zipfile.ZipFile(zip_path, 'r') as z:
-        json_files = [name for name in z.namelist() if name.lower().endswith('.json')]
-        logger.info(f"Found {len(json_files)} JSON files in zip")

[... per-file decode loop collapsed in the diff view ...]

-                else:
-                    failed_count += 1
-                    stats['failed_files'] += 1
-                    stats['encoding_errors'].append(name)
-                    logger.warning(f"Failed to load {name} - tried all encodings")
-
-            except Exception as e:
-                failed_count += 1
-                stats['failed_files'] += 1
-                logger.error(f"Error processing {name}: {e}")
-
-    logger.info(f"Successfully loaded {json_count} JSON files, failed: {failed_count}")
-    return docs
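Note: the collapsed middle of read_jsons_from_zip evidently retries several text
encodings per file before recording an encoding error ("tried all encodings").
A minimal sketch of such a fallback loop, with an assumed candidate list (the
actual encodings are not visible in this diff):

import json

def read_json_with_fallback(raw: bytes):
    # The candidate list is an assumption; the diff collapses the real one.
    for enc in ("utf-8", "utf-8-sig", "cp1251"):
        try:
            return json.loads(raw.decode(enc))
        except (UnicodeDecodeError, json.JSONDecodeError):
            continue
    return None  # the caller counts this file as failed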
-def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSplitter) -> List[Document]:
-    nodes = splitter.split_text(text)
-    chunks = []
-
-    for i, node_text in enumerate(nodes):
-        md = dict(doc_meta)
-        md.update({
-            'chunk_id': f"{md.get('document_id','unknown')}_text_{i}",
-            'chunk_type': 'text'
-        })
-        chunks.append(Document(text=node_text, metadata=md))
-
-    stats['total_text_chunks'] += len(chunks)
-    logger.debug(f"Created {len(chunks)} text chunks for document {doc_meta.get('document_id')}")
-    return chunks
-
-def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int = TABLE_MAX_ROWS_PER_CHUNK) -> List[Document]:
-    headers = table.get('headers') or []
-    rows = table.get('data') or []
-
-    stats['total_tables'] += 1
-
-    if not rows:
-        text = table.get('table_description') or table.get('table_title') or ''
-        md = {**table_meta, 'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_single"}
-        stats['total_table_chunks'] += 1
-        logger.debug(f"Created single chunk for empty table: {table_meta.get('table_title')}")
-        return [Document(text=text, metadata=md)]
-
-    chunks = []
-    num_chunks = (len(rows) + max_rows - 1) // max_rows
-
-    for i in range(0, len(rows), max_rows):
-        block = rows[i:i+max_rows]
-        lines = []
-        lines.append(f"Table {table_meta.get('table_number','?')} - {table_meta.get('table_title','')}")
-        lines.append(f"Headers: {headers}")
-        for r in block:
-            row_items = [f"{k}: {v}" for k, v in r.items()]
-            lines.append(" | ".join(row_items))
-        chunk_text = "\n".join(lines)
-        md = dict(table_meta)
-        md.update({'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_{i // max_rows}"})
-        chunks.append(Document(text=chunk_text, metadata=md))
-
-    stats['total_table_chunks'] += len(chunks)
-    logger.debug(f"Table '{table_meta.get('table_title')}': {len(rows)} rows split into {len(chunks)} chunks")
-    return chunks
-
-def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Document:
-    txt = f"Image: {image_entry.get('Название изображения') or image_entry.get('title','')}. "
-    txt += f"Описание: {image_entry.get('Описание изображение') or image_entry.get('description','')}. "
-    txt += f"Файл: {image_entry.get('Файл изображения') or image_entry.get('file','')}."
-    md = dict(image_meta)
-    md.update({'chunk_type': 'image', 'chunk_id': f"{image_meta.get('document_id')}_image_{image_entry.get('№ Изображения','0')}"})
-
-    stats['total_images'] += 1
-    logger.debug(f"Created image chunk: {image_entry.get('Название изображения', 'unknown')}")
-    return Document(text=txt, metadata=md)
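Note: the ceil-division in chunk_table packs rows into fixed-size blocks. A toy
call, assuming the removed function above is in scope (max_rows=5 is purely
illustrative; the real TABLE_MAX_ROWS_PER_CHUNK constant is collapsed out of
this diff):

# 12 data rows at 5 rows per chunk -> (12 + 5 - 1) // 5 = 3 chunks (5, 5, 2 rows)
table = {'headers': ['параметр', 'значение'],
         'data': [{'параметр': f'p{i}', 'значение': i} for i in range(12)]}
docs = chunk_table(table, {'document_id': 'DOC-1', 'table_number': 1,
                           'table_title': 'демо'}, max_rows=5)
assert len(docs) == 3 and all(d.metadata['chunk_type'] == 'table' for d in docs)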
-def build_chunks_from_repo(repo_id: str) -> List[Document]:
-    logger.info("=" * 80)
-    logger.info("Starting document processing from repository")
-    logger.info("=" * 80)
-
-    zip_paths = list_zip_files_in_repo(repo_id)
-    logger.info(f"Total zip files to process: {len(zip_paths)}")
-
-    splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
-    logger.info(f"Text splitter configured: chunk_size={CHUNK_SIZE}, chunk_overlap={CHUNK_OVERLAP}")
-
-    all_chunks = []

[... download loop collapsed in the diff view ...]

-        json_docs = read_jsons_from_zip(local_zip)

[... per-document metadata setup and section loop collapsed in the diff view ...]

-            sec_meta = dict(base_meta)
-            sec_meta.update({'section_id': sec.get('section_id'), 'section_title': None})
-            text = sec.get('section_text') or sec.get('text') or ''
-            if text and text.strip():
-                chunks = chunk_text_field(text, sec_meta, splitter)
-                all_chunks.extend(chunks)
-
-            tables = doc.get('sheets', []) + doc.get('tables', []) if (doc.get('sheets') or doc.get('tables')) else []
-            if tables:
-                logger.info(f" Processing {len(tables)} tables")
-                for tbl_idx, sheet in enumerate(tables, 1):
-                    table_meta = dict(base_meta)
-                    table_meta.update({
-                        'sheet_name': sheet.get('sheet_name') or sheet.get('table_title'),
-                        'section': sheet.get('section'),
-                        'table_number': sheet.get('table_number'),
-                        'table_title': sheet.get('table_title')
-                    })
-                    table_chunks = chunk_table(sheet, table_meta, max_rows=TABLE_MAX_ROWS_PER_CHUNK)
-                    all_chunks.extend(table_chunks)
-
-            images = doc.get('images', []) or doc.get('image_data', []) or doc.get('image_entries', [])
-            if images:
-                logger.info(f" Processing {len(images)} images")
-                for img in images:
-                    img_meta = dict(base_meta)
-                    chunk = chunk_image(img, img_meta)
-                    all_chunks.append(chunk)
-
-    logger.info("\n" + "=" * 80)
-    logger.info("PROCESSING SUMMARY")
-    logger.info("=" * 80)
-    logger.info(f"Total documents processed: {stats['total_documents']}")
-    logger.info(f"Total text chunks created: {stats['total_text_chunks']}")
-    logger.info(f"Total tables processed: {stats['total_tables']}")
-    logger.info(f"Total table chunks created: {stats['total_table_chunks']}")
-    logger.info(f"Total images processed: {stats['total_images']}")
-    logger.info(f"Total chunks created: {len(all_chunks)}")
-    logger.info(f"Failed files: {stats['failed_files']}")
-
-    if stats['encoding_errors']:
-        logger.warning(f"Files with encoding errors ({len(stats['encoding_errors'])}):")
-        for err_file in stats['encoding_errors'][:10]:
-            logger.warning(f" - {err_file}")
-        if len(stats['encoding_errors']) > 10:
-            logger.warning(f" ... and {len(stats['encoding_errors']) - 10} more")
-
-    logger.info("=" * 80)
-    return all_chunks
-def create_hybrid_index(documents):
-    logger.info("Creating hybrid index system")
-    logger.info(f"Building vector index from {len(documents)} documents")
-    vector_index = VectorStoreIndex.from_documents(documents)
-    logger.info("Vector index created successfully")
-
-    logger.info("Building keyword index")
-    keyword_index = KeywordTableIndex.from_documents(documents)
-    logger.info("Keyword index created successfully")
-
-    return vector_index, keyword_index
-
-def create_fusion_retriever(vector_index, keyword_index, documents):
-    logger.info("Creating fusion retriever with multiple retrieval strategies")
-
-    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
-    logger.info("Vector retriever configured (top_k=5)")
-
-    bm25_retriever = BM25Retriever.from_defaults(
-        docstore=vector_index.docstore,
-        similarity_top_k=5
-    )
-    logger.info("BM25 retriever configured (top_k=5)")
-
-    fusion_retriever = QueryFusionRetriever(
-        [vector_retriever, bm25_retriever],
-        similarity_top_k=5,
-        num_queries=1,
-        mode="reciprocal_rerank",
-        use_async=False
-    )
-    logger.info("Fusion retriever created with reciprocal rerank mode")
-
-    return fusion_retriever
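Note: QueryFusionRetriever in "reciprocal_rerank" mode merges the vector and
BM25 hit lists by reciprocal rank fusion. A standalone sketch of that scoring
rule (k=60 is the constant conventionally used for RRF; the diff does not show
LlamaIndex's internals, so treat this as an approximation):

def reciprocal_rank_fusion(result_lists, k=60):
    # A document's fused score is the sum of 1/(k + rank) over every
    # ranked list (best-first) in which it appears.
    scores = {}
    for results in result_lists:
        for rank, doc_id in enumerate(results, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

fused = reciprocal_rank_fusion([["a", "b", "c"], ["b", "d", "a"]])
assert fused[0] == "b"  # ranked high by both lists, so it wins the fusion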
-def create_query_engine(vector_index, keyword_index, documents):
-    logger.info("Creating query engine")
-    fusion_retriever = create_fusion_retriever(vector_index, keyword_index, documents)
-
-    response_synthesizer = get_response_synthesizer(
-        response_mode=ResponseMode.COMPACT,
-        use_async=False
-    )
-    logger.info("Response synthesizer configured (COMPACT mode)")
-
-    query_engine = RetrieverQueryEngine(
-        retriever=fusion_retriever,
-        response_synthesizer=response_synthesizer
-    )
-    logger.info("Query engine created successfully")
-
-    return query_engine
-
-def initialize_system():
-    logger.info("\n" + "=" * 80)
-    logger.info("INITIALIZING AIEXP RAG SYSTEM")
-    logger.info("=" * 80)
-
-    embed_model = get_embedding_model()
-    llm = get_llm_model(DEFAULT_MODEL)
-
-    Settings.embed_model = embed_model
-    Settings.llm = llm
-    Settings.chunk_size = CHUNK_SIZE
-    Settings.chunk_overlap = CHUNK_OVERLAP
-    logger.info("Global settings configured")
-
-    documents = build_chunks_from_repo(HF_REPO_ID)
-
-    vector_index, keyword_index = create_hybrid_index(documents)
-
-    query_engine = create_query_engine(vector_index, keyword_index, documents)
-
-    logger.info("=" * 80)
-    logger.info("SYSTEM INITIALIZATION COMPLETE")
-    logger.info("=" * 80)
-    return query_engine, vector_index, keyword_index, documents
-
-def answer_question(question, query_engine):
-    if not question.strip():
-        return "<div style='color: black;'>Please enter a question</div>"
-
-    try:
-        logger.info(f"Processing query: {question[:100]}...")
-        response = query_engine.query(question)
-        logger.info(f"Query processed, found {len(response.source_nodes)} source nodes")

[... answer HTML assembly partially collapsed in the diff view ...]

-        <div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; color: black;'>
-        <h3 style='color: #007bff;'>Answer:</h3>
-        <p>{response.response}</p>
-        </div>
-        """

[...]

-        for i, node in enumerate(response.source_nodes):
-            sources_html += f"""
-            <div style='margin: 10px 0; padding: 10px; background-color: white; border-left: 3px solid #007bff;'>
-                <strong>Document {i+1}:</strong> {node.metadata.get('document_id', 'unknown')}<br>
-                <strong>Score:</strong> {node.score:.3f}<br>
-                <strong>Text:</strong> {node.text[:200]}...
-            </div>
-            """
-        sources_html += "</div>"

[...]
     except Exception as e:
[...]
-        return error_html, error_html

-def switch_model(model_name, vector_index[…]
     try:
[...]
         new_llm = get_llm_model(model_name)
         Settings.llm = new_llm
[...]
     except Exception as e:
[...]
-        return […]

 query_engine = None
 vector_index = None
-keyword_index = None
-documents = None
 current_model = DEFAULT_MODEL

 def main_answer_question(question):
-    global query_engine
[...]

 def main_switch_model(model_name):
-    global query_engine, vector_index, […]
[...]
     if new_query_engine:
         query_engine = new_query_engine
         current_model = model_name
-    return status
-
-def create_interface():
-    with gr.Blocks(title="AIEXP - RAG System", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# AIEXP - AI Expert for Regulatory Documentation")
-
-        with gr.Row():
-            model_dropdown = gr.Dropdown(
-                choices=list(AVAILABLE_MODELS.keys()),
-                value=current_model,
-                label="Select Language Model"
-            )
-            switch_btn = gr.Button("Switch Model")
-            model_status = gr.Textbox(
-                value=f"Current model: {current_model}",
-                label="Model Status",
-                interactive=False
-            )
-
-        with gr.Row():
-            question_input = gr.Textbox(
-                label="Your Question",
-                placeholder="Ask a question about the documents...",
-                lines=3
-            )
-
-        ask_btn = gr.Button("Get Answer", variant="primary")
-
-        with gr.Row():
-            answer_output = gr.HTML(
-                label="Answer",
-                value="<div style='padding: 20px; text-align: center;'>Answer will appear here...</div>"
-            )
-            sources_output = gr.HTML(
-                label="Sources",
-                value="<div style='padding: 20px; text-align: center;'>Sources will appear here...</div>"
-            )
-
-        switch_btn.click(
-            fn=main_switch_model,
-            inputs=[model_dropdown],
-            outputs=[model_status]
-        )
-
-        ask_btn.click(
-            fn=main_answer_question,
-            inputs=[question_input],
-            outputs=[answer_output, sources_output]
-        )
-
-        question_input.submit(
-            fn=main_answer_question,
-            inputs=[question_input],
-            outputs=[answer_output, sources_output]
-        )
-
-    return demo

 def main():
-    global query_engine, […]
[...]
-    query_engine, […] = initialize_system()

     if query_engine:
-        […]
-        demo = create_interface()
         demo.launch(
             server_name="0.0.0.0",
             server_port=7860,
-            share=True
         )
     else:
-        […]
         sys.exit(1)

 if __name__ == "__main__":
 import gradio as gr
+import os
+from llama_index.core import Settings
+from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
+from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
+from my_logging import log_message
+from index_retriever import create_vector_index, create_query_engine
 import sys
+from config import (
+    HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
+    JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
+)
+def create_chunks_display_html(chunk_info):
+    if not chunk_info:
+        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
+
+    html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
+    html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>"
+
+    for i, chunk in enumerate(chunk_info):
+        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
+
+        # Get section display info
+        section_display = get_section_display(chunk)
+        formatted_content = get_formatted_content(chunk)

+        html += f"""
+        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
+            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
+            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
+            <strong style='color: black;'>Содержание:</strong><br>
+            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
+                {formatted_content}
+            </div>
+        </div>
+        """
+
+    html += "</div>"
+    return html
+
+def get_section_display(chunk):
+    section_path = chunk.get('section_path', '')
+    section_id = chunk.get('section_id', 'unknown')
+    doc_type = chunk.get('type', 'text')
+
+    if doc_type == 'table' and chunk.get('table_number'):
+        table_num = chunk.get('table_number')
+        if not str(table_num).startswith('№'):
+            table_num = f"№{table_num}"
+        return f"таблица {table_num}"
+
+    if doc_type == 'image' and chunk.get('image_number'):
+        image_num = chunk.get('image_number')
+        if not str(image_num).startswith('№'):
+            image_num = f"№{image_num}"
+        return f"рисунок {image_num}"
+
+    if section_path:
+        return section_path
+    elif section_id and section_id != 'unknown':
+        return section_id
+
+    return section_id
+
+def get_formatted_content(chunk):
+    document_id = chunk.get('document_id', 'unknown')
+    section_path = chunk.get('section_path', '')
+    section_id = chunk.get('section_id', 'unknown')
+    section_text = chunk.get('section_text', '')
+    parent_section = chunk.get('parent_section', '')
+    parent_title = chunk.get('parent_title', '')
+    level = chunk.get('level', '')
+    chunk_text = chunk.get('chunk_text', '')
+    doc_type = chunk.get('type', 'text')
+
+    # For text documents
+    if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
+        current_section = section_path if section_path else section_id
+        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
+        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
+    else:
+        current_section = section_path if section_path else section_id
+        clean_text = chunk_text
+        if section_text and chunk_text.startswith(section_text):
+            section_title = section_text
+        elif chunk_text.startswith(f"{current_section} "):
+            clean_text = chunk_text[len(f"{current_section} "):].strip()
+            section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
         else:
+            section_title = section_text if section_text else current_section

+        return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
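Note: together these helpers turn a chunk-info dict into the source label shown
in the UI; for example (values illustrative):

chunk = {"type": "table", "table_number": "3",
         "document_id": "ГОСТ Р 50.04.07-2022", "section_id": "5.2"}
print(get_section_display(chunk))  # -> таблица №3
print(get_section_display({"type": "text", "section_path": "Приложение Л"}))  # -> Приложение Л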
+def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
+                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
+                      use_json_instead_csv=False):
+    try:
+        from documents_prep import process_documents_with_chunking
+        log_message("Инициализация системы")
+        os.makedirs(download_dir, exist_ok=True)
+        from config import CHUNK_SIZE, CHUNK_OVERLAP
+        from llama_index.core.text_splitter import TokenTextSplitter
+
+        embed_model = get_embedding_model()
+        llm = get_llm_model(DEFAULT_MODEL)
+        reranker = get_reranker_model()
+
+        Settings.embed_model = embed_model
+        Settings.llm = llm
+        Settings.text_splitter = TokenTextSplitter(
+            chunk_size=CHUNK_SIZE,
+            chunk_overlap=CHUNK_OVERLAP,
+            separator=" ",
+            backup_separators=["\n", ".", "!", "?"]
+        )
+
+        log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
+        log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
+
+        all_documents = []
+        chunks_df = None
+        chunk_info = []

+        if use_json_instead_csv and json_files_dir:
+            log_message("Используем JSON файлы вместо CSV")
+            json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
+            all_documents.extend(json_documents)
+            chunk_info.extend(json_chunk_info)
+        else:
+            if chunks_filename:
+                log_message("Загружаем данные из CSV")
+                csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
+                all_documents.extend(csv_documents)

+        if table_data_dir:
+            log_message("Добавляю табличные данные")
+            table_documents = load_table_data(repo_id, hf_token, table_data_dir)
+            log_message(f"Загружено {len(table_documents)} табличных документов")

+            # Process table documents through chunking
+            chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
+            all_documents.extend(chunked_table_docs)
+            chunk_info.extend(table_chunk_info)
+
+        if image_data_dir:
+            log_message("Добавляю данные изображений")
+            image_documents = load_image_data(repo_id, hf_token, image_data_dir)
+            log_message(f"Загружено {len(image_documents)} документов изображений")

+            # Process image documents through chunking
+            chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
+            all_documents.extend(chunked_image_docs)
+            chunk_info.extend(image_chunk_info)

+        log_message(f"Всего документов после всей обработки: {len(all_documents)}")

+        vector_index = create_vector_index(all_documents)
+        query_engine = create_query_engine(vector_index)

+        log_message(f"Система успешно инициализирована")
+        return query_engine, chunks_df, reranker, vector_index, chunk_info

 except Exception as e:
+        log_message(f"Ошибка инициализации: {str(e)}")
+        return None, None, None, None, []
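Note: assigning Settings.text_splitter makes this token splitter the default
for any document that still needs splitting at index time: it cuts on spaces
first, falling back to newlines and sentence punctuation. A quick way to see
the overlap behaviour (sizes here are illustrative; the real values come from
config.py):

from llama_index.core.text_splitter import TokenTextSplitter

splitter = TokenTextSplitter(chunk_size=64, chunk_overlap=8, separator=" ",
                             backup_separators=["\n", ".", "!", "?"])
pieces = splitter.split_text("очень длинный нормативный текст " * 100)
# Consecutive pieces share roughly chunk_overlap tokens, so a sentence that
# straddles a boundary is still retrievable from at least one chunk.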
+def switch_model(model_name, vector_index):
+    from llama_index.core import Settings
+    from index_retriever import create_query_engine
+
 try:
+        log_message(f"Переключение на модель: {model_name}")
+
 new_llm = get_llm_model(model_name)
 Settings.llm = new_llm

+    if vector_index is not None:
+        new_query_engine = create_query_engine(vector_index)
+        log_message(f"Модель успешно переключена на: {model_name}")
+        return new_query_engine, f"✅ Модель переключена на: {model_name}"
+    else:
+        return None, "❌ Ошибка: система не инициализирована"
+
+    except Exception as e:
+        error_msg = f"Ошибка переключения модели: {str(e)}"
+        log_message(error_msg)
+        return None, f"❌ {error_msg}"
+
+def main_answer_question(question):
+    global query_engine, reranker, current_model, chunks_df
+    if not question.strip():
+        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
+                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
+                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
+
+    try:
+        # Call the answer_question function which returns 3 values
+        answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
+        return answer_html, sources_html, chunks_html
+
 except Exception as e:
+        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
+        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
+                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
+                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
+
+
+def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
+    with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
+
+        gr.Markdown("""
+        # AIEXP - Artificial Intelligence Expert
+
+        ## Инструмент для работы с нормативной документацией
+        """)
+
+        with gr.Tab("Поиск по нормативным документам"):
+            gr.Markdown("### Задайте вопрос по нормативной документации")
+
+            with gr.Row():
+                with gr.Column(scale=2):
+                    model_dropdown = gr.Dropdown(
+                        choices=list(AVAILABLE_MODELS.keys()),
+                        value=current_model,
+                        label="Выберите языковую модель",
+                        info="Выберите модель для генерации ответов"
+                    )
+                with gr.Column(scale=1):
+                    switch_btn = gr.Button("Переключить модель", variant="secondary")
+                    model_status = gr.Textbox(
+                        value=f"Текущая модель: {current_model}",
+                        label="Статус модели",
+                        interactive=False
+                    )
+
+            with gr.Row():
+                with gr.Column(scale=3):
+                    question_input = gr.Textbox(
+                        label="Ваш вопрос к базе знаний",
+                        placeholder="Введите вопрос по нормативным документам...",
+                        lines=3
+                    )
+                    ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
+
+            gr.Examples(
+                examples=[
+                    "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
+                    "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
+                    "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
+                    "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
+                    "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
+                    "В какой таблице можно найти информацию о методах исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
+                ],
+                inputs=question_input
+            )
+
+            with gr.Row():
+                with gr.Column(scale=2):
+                    answer_output = gr.HTML(
+                        label="",
+                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
+                    )
+
+                with gr.Column(scale=1):
+                    sources_output = gr.HTML(
+                        label="",
+                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
+                    )
+
+                with gr.Column(scale=1):
+                    chunks_output = gr.HTML(
+                        label="Релевантные чанки",
+                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
+                    )
+
+            switch_btn.click(
+                fn=switch_model_func,
+                inputs=[model_dropdown],
+                outputs=[model_status]
+            )
+
+            ask_btn.click(
+                fn=answer_question_func,
+                inputs=[question_input],
+                outputs=[answer_output, sources_output, chunks_output]
+            )
+
+            question_input.submit(
+                fn=answer_question_func,
+                inputs=[question_input],
+                outputs=[answer_output, sources_output, chunks_output]
+            )
+    return demo
 query_engine = None
+chunks_df = None
+reranker = None
 vector_index = None
 current_model = DEFAULT_MODEL

 def main_answer_question(question):
+    global query_engine, reranker, current_model, chunks_df
+    answer_html, sources_html, chunks_html = answer_question(
+        question, query_engine, reranker, current_model, chunks_df
+    )
+    return answer_html, sources_html, chunks_html

 def main_switch_model(model_name):
+    global query_engine, vector_index, current_model
+
+    new_query_engine, status_message = switch_model(model_name, vector_index)
 if new_query_engine:
 query_engine = new_query_engine
 current_model = model_name

+    return status_message
 def main():
+    global query_engine, chunks_df, reranker, vector_index, current_model
+
+    log_message("Запуск AIEXP - AI Expert для нормативной документации")
+
+    query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
+        repo_id=HF_REPO_ID,
+        hf_token=HF_TOKEN,
+        download_dir=DOWNLOAD_DIR,
+        json_files_dir=JSON_FILES_DIR,
+        table_data_dir=TABLE_DATA_DIR,
+        image_data_dir=IMAGE_DATA_DIR,
+        use_json_instead_csv=True,
+    )

 if query_engine:
+        log_message("Запуск веб-интерфейса")
+        demo = create_demo_interface(
+            answer_question_func=main_answer_question,
+            switch_model_func=main_switch_model,
+            current_model=current_model,
+            chunk_info=chunk_info
+        )
 demo.launch(
 server_name="0.0.0.0",
 server_port=7860,
+            share=True,
+            debug=False
 )
 else:
+        log_message("Невозможно запустить приложение из-за ошибки инициализации")
 sys.exit(1)

 if __name__ == "__main__":
     main()
documents_prep.py CHANGED
@@ -1,381 +1,229 @@
 import json
 import zipfile
 import pandas as pd
-from collections import Counter
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
-from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
 from config import CHUNK_SIZE, CHUNK_OVERLAP

[... removed lines collapsed in the diff view ...]

 text_splitter = SentenceSplitter(
-    chunk_size=CHUNK_SIZE,
-    chunk_overlap=CHUNK_OVERLAP,
 separator=" "
 )
 text_chunks = text_splitter.split_text(doc.text)
-    chunked_docs = []
-
-    for i, chunk_text in enumerate(text_chunks):
-        chunk_metadata = doc.metadata.copy()
-        chunk_metadata.update({
-            "chunk_id": i,
-            "total_chunks": len(text_chunks),
-            "chunk_size": len(chunk_text)
-        })
-
-        chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))
-
-    return chunked_docs
-
-
-# ============================================================================
-# TABLE PROCESSING
-# ============================================================================
-
-def extract_table_metadata(table_text):
-    """Extract key terms from table for enrichment"""
-    words = table_text.split()
-
-    # Filter stopwords and short words
-    stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
-    filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
-
-    # Get top 15 most common terms
-    common = Counter(filtered).most_common(15)
-    key_terms = [w for w, _ in common]
-
-    return {
-        "summary": f"Таблица содержит {len(words)} слов",
-        "key_terms": key_terms
-    }
-
-
-def create_table_content(table_data):
-    """Format table data as text"""
-    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
-    table_num = table_data.get('table_number', 'Неизвестно')
-    table_title = table_data.get('table_title', 'Неизвестно')
-    section = table_data.get('section', 'Неизвестно')
-
-    content = f"Таблица: {table_num}\n"
-    content += f"Название: {table_title}\n"
-    content += f"Документ: {doc_id}\n"
-    content += f"Раздел: {section}\n"
-
-    # Add headers
-    headers = table_data.get('headers', [])
-    if headers:
-        content += f"\nЗаголовки: {' | '.join(headers)}\n"
-
-    # Add data rows
-    if 'data' in table_data and isinstance(table_data['data'], list):
-        content += "\nДанные таблицы:\n"
-        for row_idx, row in enumerate(table_data['data'], start=1):
-            if isinstance(row, dict):
-                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
-                content += f"Строка {row_idx}: {row_text}\n"
-
-    return content
-
-
-def chunk_table_by_rows(doc):
-    """Split large table into chunks by rows, preserving headers"""
-    # Extract metadata
-    table_metadata = extract_table_metadata(doc.text)
-    table_num = doc.metadata.get('table_number', 'unknown')
-    table_title = doc.metadata.get('table_title', 'unknown')
-
-    # Parse table structure
-    lines = doc.text.strip().split('\n')

-    # Separate header and data rows
-    table_header_lines = []
-    data_rows = []
-    in_data = False
-
-    for line in lines:
-        if line.startswith('Данные таблицы:'):
-            in_data = True
-            table_header_lines.append(line)
-        elif in_data and line.startswith('Строка'):
-            data_rows.append(line)
-        elif not in_data:
-            table_header_lines.append(line)
-
-    table_header = '\n'.join(table_header_lines) + '\n'
-
-    # If no rows, use standard text splitting
-    if not data_rows:
-        log_message(f" ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
-        return chunk_text_document(doc)
-
-    log_message(f" 📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
-
-    # Row-based chunking
-    header_size = len(table_header)
-    available_size = CHUNK_SIZE - header_size - 300  # Reserve space for enrichment
-
-    text_chunks = []
-    current_chunk_rows = []
-    current_size = 0
-
-    for row in data_rows:
-        row_size = len(row) + 1
-
-        # If adding this row exceeds limit, create chunk
-        if current_size + row_size > available_size and current_chunk_rows:
-            chunk_text = table_header + '\n'.join(current_chunk_rows)
-            text_chunks.append(chunk_text)
-            log_message(f" ✂️ Создан чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-
-            # Keep last 2 rows for overlap
-            overlap_count = min(2, len(current_chunk_rows))
-            current_chunk_rows = current_chunk_rows[-overlap_count:]
-            current_size = sum(len(r) + 1 for r in current_chunk_rows)
-
-        current_chunk_rows.append(row)
-        current_size += row_size
-
-    # Final chunk
-    if current_chunk_rows:
-        chunk_text = table_header + '\n'.join(current_chunk_rows)
-        text_chunks.append(chunk_text)
-        log_message(f" ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-
-    log_message(f" 📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
-
-    # Create enriched chunks with metadata
 chunked_docs = []
-    key_terms = table_metadata.get("key_terms", [])
-
 for i, chunk_text in enumerate(text_chunks):
 chunk_metadata = doc.metadata.copy()
 chunk_metadata.update({
 "chunk_id": i,
 "total_chunks": len(text_chunks),
 "chunk_size": len(chunk_text),
-            "[…]
-            "key_terms": key_terms
 })

[... construction of the enriched_text f-string collapsed in the diff view ...]
-        {chunk_text}"""
-
-        chunked_docs.append(Document(text=enriched_text, metadata=chunk_metadata))

 return chunked_docs
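Note: unlike the fixed row count in the old app.py, chunk_table_by_rows packs
rows against a character budget and re-attaches the header plus the last two
rows of the previous chunk. A standalone rerun of just that packing rule with a
tiny budget, to make the overlap visible:

rows = [f"Строка {i}: значение {i}" for i in range(1, 8)]
budget, chunks, cur, size = 100, [], [], 0
for row in rows:
    if size + len(row) + 1 > budget and cur:
        chunks.append(list(cur))
        cur = cur[-2:]                      # keep last 2 rows for overlap
        size = sum(len(r) + 1 for r in cur)
    cur.append(row)
    size += len(row) + 1
if cur:
    chunks.append(cur)
# Each chunk after the first opens with the two closing rows of its
# predecessor, mirroring the overlap logic in chunk_table_by_rows.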
-def table_to_document(table_data, document_id):   # name restored from the call sites below; body partially collapsed
-    """[…]
[...]
-        return []
-
-    content = create_table_content(table_data)
-    content_size = len(content)
-
-    base_doc = Document(
-        text=content,
-        metadata={
-            "type": "table",
-            "table_number": table_num,
-            "table_title": table_title,
-            "document_id": doc_id,
-            "section": section,
-            "section_id": section,
-            "total_rows": len(table_rows),
-            "content_size": content_size
-        }
-    )
-
-    # Chunk if needed
-    if content_size > CHUNK_SIZE:
-        log_message(f"📊 CHUNKING: Таблица {table_num} | Размер: {content_size} > {CHUNK_SIZE}")
-        return chunk_table_by_rows(base_doc)
-    else:
-        log_message(f"✓ Таблица {table_num} | Размер: {content_size} символов | Строк: {len(table_rows)}")
-        return [base_doc]
-
-
-def load_table_data(repo_id, hf_token, table_data_dir):
-    """Load all table data from HuggingFace repo"""
-    log_message("=" * 60)
-    log_message("ЗАГРУЗКА ТАБЛИЧНЫХ ДАННЫХ")
-    log_message("=" * 60)

[... file listing collapsed in the diff view ...]

-    log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
-
-    table_documents = []
-
-    for file_path in table_files:
-        try:
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=file_path,
-                local_dir='',
-                repo_type="dataset",
-                token=hf_token
-            )
-
-            log_message(f"\nОбработка файла: {file_path}")
-
-            with open(local_path, 'r', encoding='utf-8') as f:
-                table_data = json.load(f)
-
-            if isinstance(table_data, dict):
-                document_id = table_data.get('document', 'unknown')
-
-                # Process sheets if present
-                if 'sheets' in table_data:
-                    sorted_sheets = sorted(
-                        table_data['sheets'],
-                        key=lambda sheet: sheet.get('table_number', '')
-                    )
-
-                    for sheet in sorted_sheets:
-                        sheet['document'] = document_id
-                        docs_list = table_to_document(sheet, document_id)
-                        table_documents.extend(docs_list)
-                else:
-                    docs_list = table_to_document(table_data, document_id)
-                    table_documents.extend(docs_list)
-
-        except Exception as e:
-            log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
-            continue

[... summary logging and the def extract_section_title(section_text): header collapsed in the diff view ...]

-    """Extract clean title from section text"""
-    if not section_text.strip():
-        return ""
-
-    first_line = section_text.strip().split('\n')[0].strip()
-
-    if len(first_line) < 200 and not first_line.endswith('.'):
-        return first_line

[...]
-    return […]
 def extract_text_from_json(data, document_id, document_name):
-    """Extract text documents from JSON structure"""
 documents = []

-    if 'sections' […]
[...]
-        if section_text.strip():
 section_title = extract_section_title(section_text)
[...]
-            if subsection_text.strip():
 subsection_title = extract_section_title(subsection_text)
[...]

 return documents
 def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
-    "[…]
-    log_message("=" * 60)
-    log_message("ЗАГРУЗКА JSON ДОКУМЕНТОВ")
-    log_message("=" * 60)

 try:
 files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
 zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
 json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]

-        log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} JSON файлов")

 all_documents = []

-        # Process ZIP files
 for zip_file_path in zip_files:
 try:
-                log_message(f"Загружаю ZIP […]
 local_zip_path = hf_hub_download(
 repo_id=repo_id,
 filename=zip_file_path,

@@ -384,30 +232,17 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):

 token=hf_token
 )

-                […]
-                for json_file in json_files_in_zip:
-                    with zip_ref.open(json_file) as f:
-                        json_data = json.load(f)
-
-                        metadata = json_data.get('document_metadata', {})
-                        doc_id = metadata.get('document_id', 'unknown')
-                        doc_name = metadata.get('document_name', 'unknown')
-
-                        docs = extract_text_from_json(json_data, doc_id, doc_name)
-                        all_documents.extend(docs)
-
-                log_message(f"Извлечено документов из ZIP: {len(all_documents)}")

 except Exception as e:
-                log_message(f"[…]
 continue

-        # Process direct JSON files
 for file_path in json_files:
 try:
 local_path = hf_hub_download(
 repo_id=repo_id,
 filename=file_path,

@@ -419,52 +254,100 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):

 with open(local_path, 'r', encoding='utf-8') as f:
 json_data = json.load(f)

[...]

-                all_documents.extend(docs)

 except Exception as e:
-                log_message(f"[…]
 continue

-        log_message(f"Всего […]

 chunked_documents, chunk_info = process_documents_with_chunking(all_documents)

-        log_message(f"После chunking […]
-        log_message("=" * 60)

 return chunked_documents, chunk_info

 except Exception as e:
-        log_message(f"[…]
 return [], []
 def load_image_data(repo_id, hf_token, image_data_dir):
-    "[…]
-    log_message("=" * 60)
-    log_message("ЗАГРУЗКА ДАННЫХ ИЗОБРАЖЕНИЙ")
-    log_message("=" * 60)

 try:
 files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        […]

 log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")

 image_documents = []
-
 for file_path in image_files:
 try:
 local_path = hf_hub_download(
 repo_id=repo_id,
 filename=file_path,

@@ -474,14 +357,18 @@ def load_image_data(repo_id, hf_token, image_data_dir):

 )

 df = pd.read_csv(local_path)
-                log_message(f"Загружено {len(df)} изображений из {file_path}")

 for _, row in df.iterrows():
 content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
 content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
-                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
 content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
-                    content += f"Раздел: {[…]

 doc = Document(
 text=content,

@@ -489,147 +376,29 @@ def load_image_data(repo_id, hf_token, image_data_dir):

 "type": "image",
 "image_number": str(row.get('№ Изображения', 'unknown')),
 "image_title": str(row.get('Название изображения', 'unknown')),
 "document_id": str(row.get('Обозначение документа', 'unknown')),
-                            "[…]
 }
 )
 image_documents.append(doc)

 except Exception as e:
-                log_message(f"[…]
 continue

-        log_message(f"[…]
-        log_message("=" * 60)
-
 return image_documents

 except Exception as e:
-        log_message(f"[…]
 return []
| 512 |
-
# ============================================================================
|
| 513 |
-
# DOCUMENT PROCESSING WITH CHUNKING
|
| 514 |
-
# ============================================================================
|
| 515 |
-
|
| 516 |
-
def process_documents_with_chunking(documents):
|
| 517 |
-
"""Process all documents and chunk if needed"""
|
| 518 |
-
all_chunked_docs = []
|
| 519 |
-
chunk_info = []
|
| 520 |
-
|
| 521 |
-
stats = {
|
| 522 |
-
'text_chunks': 0,
|
| 523 |
-
'table_whole': 0,
|
| 524 |
-
'table_chunks': 0,
|
| 525 |
-
'image_whole': 0,
|
| 526 |
-
'image_chunks': 0
|
| 527 |
-
}
|
| 528 |
-
|
| 529 |
-
for doc in documents:
|
| 530 |
-
doc_type = doc.metadata.get('type', 'text')
|
| 531 |
-
is_already_chunked = doc.metadata.get('is_chunked', False)
|
| 532 |
-
doc_size = len(doc.text)
|
| 533 |
-
|
| 534 |
-
# Tables - already chunked or whole
|
| 535 |
-
if doc_type == 'table':
|
| 536 |
-
if is_already_chunked:
|
| 537 |
-
stats['table_chunks'] += 1
|
| 538 |
-
else:
|
| 539 |
-
stats['table_whole'] += 1
|
| 540 |
-
|
| 541 |
-
all_chunked_docs.append(doc)
|
| 542 |
-
chunk_info.append({
|
| 543 |
-
'document_id': doc.metadata.get('document_id', 'unknown'),
|
| 544 |
-
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 545 |
-
'chunk_id': doc.metadata.get('chunk_id', 0),
|
| 546 |
-
'total_chunks': doc.metadata.get('total_chunks', 1),
|
| 547 |
-
'chunk_size': doc_size,
|
| 548 |
-
'chunk_preview': doc.text[:200] + "..." if doc_size > 200 else doc.text,
|
| 549 |
-
'type': 'table',
|
| 550 |
-
'table_number': doc.metadata.get('table_number', 'unknown')
|
| 551 |
-
})
|
| 552 |
-
|
| 553 |
-
# Images - chunk if too large
|
| 554 |
-
elif doc_type == 'image':
|
| 555 |
-
if doc_size > CHUNK_SIZE:
|
| 556 |
-
log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number')} | Размер: {doc_size}")
|
| 557 |
-
chunked_docs = chunk_text_document(doc)
|
| 558 |
-
stats['image_chunks'] += len(chunked_docs)
|
| 559 |
-
all_chunked_docs.extend(chunked_docs)
|
| 560 |
-
|
| 561 |
-
for i, chunk_doc in enumerate(chunked_docs):
|
| 562 |
-
chunk_info.append({
|
| 563 |
-
'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
|
| 564 |
-
'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
|
| 565 |
-
'chunk_id': i,
|
| 566 |
-
'chunk_size': len(chunk_doc.text),
|
| 567 |
-
'chunk_preview': chunk_doc.text[:200] + "...",
|
| 568 |
-
'type': 'image',
|
| 569 |
-
'image_number': chunk_doc.metadata.get('image_number', 'unknown')
|
| 570 |
-
})
|
| 571 |
-
else:
|
| 572 |
-
stats['image_whole'] += 1
|
| 573 |
-
all_chunked_docs.append(doc)
|
| 574 |
-
chunk_info.append({
|
| 575 |
-
'document_id': doc.metadata.get('document_id', 'unknown'),
|
| 576 |
-
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 577 |
-
'chunk_id': 0,
|
| 578 |
-
'chunk_size': doc_size,
|
| 579 |
-
'chunk_preview': doc.text[:200] + "...",
|
| 580 |
-
'type': 'image',
|
| 581 |
-
'image_number': doc.metadata.get('image_number', 'unknown')
|
| 582 |
-
})
|
| 583 |
-
|
| 584 |
-
# Text - chunk if too large
|
| 585 |
-
else:
|
| 586 |
-
if doc_size > CHUNK_SIZE:
|
| 587 |
-
log_message(f"📝 CHUNKING: Текст '{doc.metadata.get('document_id')}' | Размер: {doc_size}")
|
| 588 |
-
chunked_docs = chunk_text_document(doc)
|
| 589 |
-
stats['text_chunks'] += len(chunked_docs)
|
| 590 |
-
all_chunked_docs.extend(chunked_docs)
|
| 591 |
-
|
| 592 |
-
for i, chunk_doc in enumerate(chunked_docs):
|
| 593 |
-
chunk_info.append({
|
| 594 |
-
'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
|
| 595 |
-
'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
|
| 596 |
-
'chunk_id': i,
|
| 597 |
-
'chunk_size': len(chunk_doc.text),
|
| 598 |
-
'chunk_preview': chunk_doc.text[:200] + "...",
|
| 599 |
-
'type': 'text'
|
| 600 |
-
})
|
| 601 |
-
else:
|
| 602 |
-
all_chunked_docs.append(doc)
|
| 603 |
-
chunk_info.append({
|
| 604 |
-
'document_id': doc.metadata.get('document_id', 'unknown'),
|
| 605 |
-
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 606 |
-
'chunk_id': 0,
|
| 607 |
-
'chunk_size': doc_size,
|
| 608 |
-
'chunk_preview': doc.text[:200] + "...",
|
| 609 |
-
'type': 'text'
|
| 610 |
-
})
|
| 611 |
-
|
| 612 |
-
# Log summary
|
| 613 |
-
log_message(f"\n{'='*60}")
|
| 614 |
-
log_message("ИТОГОВАЯ СТАТИСТИКА:")
|
| 615 |
-
log_message(f" • Текстовые чанки: {stats['text_chunks']}")
|
| 616 |
-
log_message(f" • Таблицы (целые): {stats['table_whole']}")
|
| 617 |
-
log_message(f" • Таблицы (чанки): {stats['table_chunks']}")
|
| 618 |
-
log_message(f" • Изображения (целые): {stats['image_whole']}")
|
| 619 |
-
log_message(f" • Изображения (чанки): {stats['image_chunks']}")
|
| 620 |
-
log_message(f" • ВСЕГО ДОКУМЕНТОВ: {len(all_chunked_docs)}")
|
| 621 |
-
log_message(f"{'='*60}\n")
|
| 622 |
-
|
| 623 |
-
return all_chunked_docs, chunk_info
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
# ============================================================================
|
| 627 |
-
# CSV CHUNKS (Legacy support)
|
| 628 |
-
# ============================================================================
|
| 629 |
-
|
| 630 |
def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
|
| 631 |
-
"
|
| 632 |
-
log_message("Загрузка данны�� из CSV")
|
| 633 |
|
| 634 |
try:
|
| 635 |
chunks_csv_path = hf_hub_download(
|
|
@@ -643,16 +412,17 @@ def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
|
|
| 643 |
chunks_df = pd.read_csv(chunks_csv_path)
|
| 644 |
log_message(f"Загружено {len(chunks_df)} чанков из CSV")
|
| 645 |
|
| 646 |
-
# Find text column
|
| 647 |
text_column = None
|
| 648 |
for col in chunks_df.columns:
|
| 649 |
-
if
|
| 650 |
text_column = col
|
| 651 |
break
|
| 652 |
|
| 653 |
if text_column is None:
|
| 654 |
text_column = chunks_df.columns[0]
|
| 655 |
|
|
|
|
|
|
|
| 656 |
documents = []
|
| 657 |
for i, (_, row) in enumerate(chunks_df.iterrows()):
|
| 658 |
doc = Document(
|
|
@@ -665,9 +435,9 @@ def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
|
|
| 665 |
)
|
| 666 |
documents.append(doc)
|
| 667 |
|
| 668 |
-
log_message(f"Создано {len(documents)} документов из CSV")
|
| 669 |
return documents, chunks_df
|
| 670 |
|
| 671 |
except Exception as e:
|
| 672 |
-
log_message(f"
|
| 673 |
return [], None
|
|
|
|
| 1 |
import json
import zipfile
import pandas as pd
from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document
from my_logging import log_message
+from llama_index.core.text_splitter import SentenceSplitter
from config import CHUNK_SIZE, CHUNK_OVERLAP
+from table_prep import table_to_document, load_table_data


+def chunk_document(doc, chunk_size=None, chunk_overlap=None):
+    """
+    Universal chunking for text and images.
+    Tables use their own row-block chunking.
+    """
+    if chunk_size is None:
+        chunk_size = CHUNK_SIZE
+    if chunk_overlap is None:
+        chunk_overlap = CHUNK_OVERLAP
+
+    # Use sentence-aware splitting
    text_splitter = SentenceSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
        separator=" "
    )

    text_chunks = text_splitter.split_text(doc.text)

    chunked_docs = []
    for i, chunk_text in enumerate(text_chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(text_chunks),
            "chunk_size": len(chunk_text),
+            "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
        })

+        chunked_doc = Document(
+            text=chunk_text,
+            metadata=chunk_metadata
+        )
+        chunked_docs.append(chunked_doc)

    return chunked_docs
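Review note: chunk_document delegates all splitting to SentenceSplitter, so boundaries respect sentences and SentenceSplitter counts chunk_size in tokens, while the `len(doc.text) > CHUNK_SIZE` guards elsewhere compare characters. A minimal sketch of the splitter's behavior, with assumed sizes standing in for the real config values:

    # Sketch only; 256/32 are assumed values, the app reads CHUNK_SIZE/CHUNK_OVERLAP from config.py
    from llama_index.core.text_splitter import SentenceSplitter

    splitter = SentenceSplitter(chunk_size=256, chunk_overlap=32, separator=" ")
    long_text = "Первое предложение примера. " * 200
    chunks = splitter.split_text(long_text)
    print(len(chunks))      # several chunks
    print(chunks[1][:60])   # begins with text repeated from the end of chunks[0] (the overlap)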

+def process_documents_with_chunking(documents):
+    """
+    Process all document types with appropriate chunking.
+    Tables: row-block chunking (handled in table_prep.py)
+    Text/Images: sentence-aware chunking
+    """
+    all_chunked_docs = []
+    stats = {
+        'table_whole': 0,
+        'table_chunks': 0,
+        'image_whole': 0,
+        'image_chunks': 0,
+        'text_chunks': 0
+    }

+    for doc in documents:
+        doc_type = doc.metadata.get('type', 'text')
+        is_already_chunked = doc.metadata.get('is_chunked', False)

+        # Tables: already chunked in table_prep.py if needed
+        if doc_type == 'table':
+            if is_already_chunked:
+                stats['table_chunks'] += 1
+            else:
+                stats['table_whole'] += 1
+            all_chunked_docs.append(doc)

+        # Images: chunk if too large
+        elif doc_type == 'image':
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number')} | {doc_size} > {CHUNK_SIZE}")
+                chunked_docs = chunk_document(doc)
+                stats['image_chunks'] += len(chunked_docs)
+                all_chunked_docs.extend(chunked_docs)
+            else:
+                stats['image_whole'] += 1
+                all_chunked_docs.append(doc)

+        # Text: chunk if too large
+        else:
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                log_message(f"📝 CHUNKING: Текст '{doc.metadata.get('document_id')}' | {doc_size} > {CHUNK_SIZE}")
+                chunked_docs = chunk_document(doc)
+                stats['text_chunks'] += len(chunked_docs)
+                all_chunked_docs.extend(chunked_docs)
+            else:
+                all_chunked_docs.append(doc)

+    log_message(f"\n{'='*60}")
+    log_message(f"СТАТИСТИКА ОБРАБОТКИ:")
+    log_message(f"  • Таблицы (целые): {stats['table_whole']}")
+    log_message(f"  • Таблицы (чанки): {stats['table_chunks']}")
+    log_message(f"  • Изображения (целые): {stats['image_whole']}")
+    log_message(f"  • Изображения (чанки): {stats['image_chunks']}")
+    log_message(f"  • Текстовые чанки: {stats['text_chunks']}")
+    log_message(f"  • ВСЕГО: {len(all_chunked_docs)}")
+    log_message(f"{'='*60}\n")

+    return all_chunked_docs, []  # Second return value for backward compatibility
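To see the dispatch rules concretely, a sketch with one document per type (hypothetical sizes; assumes CHUNK_SIZE is around 1024 characters):

    # Sketch; table docs pass through, oversized image/text docs get sentence-split
    from llama_index.core import Document

    docs = [
        Document(text="строка таблицы", metadata={"type": "table", "is_chunked": True}),
        Document(text="описание " * 400, metadata={"type": "image", "image_number": "5"}),  # > CHUNK_SIZE
        Document(text="короткий текст", metadata={"type": "text", "document_id": "doc-1"}),  # kept whole
    ]
    chunked, _ = process_documents_with_chunking(docs)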

def extract_text_from_json(data, document_id, document_name):
    documents = []

+    if 'sections' in data:
+        for section in data['sections']:
+            section_id = section.get('section_id', 'Unknown')
+            section_text = section.get('section_text', '')
+
+            section_path = f"{section_id}"
            section_title = extract_section_title(section_text)
+
+            if section_text.strip():
+                doc = Document(
+                    text=section_text,
+                    metadata={
+                        "type": "text",
+                        "document_id": document_id,
+                        "document_name": document_name,
+                        "section_id": section_id,
+                        "section_text": section_title[:200],
+                        "section_path": section_path,
+                        "level": "section"
+                    }
+                )
+                documents.append(doc)
+
+            if 'subsections' in section:
+                for subsection in section['subsections']:
+                    subsection_id = subsection.get('subsection_id', 'Unknown')
+                    subsection_text = subsection.get('subsection_text', '')
                    subsection_title = extract_section_title(subsection_text)
+                    subsection_path = f"{section_path}.{subsection_id}"
+
+                    if subsection_text.strip():
+                        doc = Document(
+                            text=subsection_text,
+                            metadata={
+                                "type": "text",
+                                "document_id": document_id,
+                                "document_name": document_name,
+                                "section_id": subsection_id,
+                                "section_text": subsection_title[:200],
+                                "section_path": subsection_path,
+                                "level": "subsection",
+                                "parent_section": section_id,
+                                "parent_title": section_title[:100]
+                            }
+                        )
+                        documents.append(doc)
+
+                    if 'sub_subsections' in subsection:
+                        for sub_subsection in subsection['sub_subsections']:
+                            sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
+                            sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
+                            sub_subsection_title = extract_section_title(sub_subsection_text)
+                            sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
+
+                            if sub_subsection_text.strip():
+                                doc = Document(
+                                    text=sub_subsection_text,
+                                    metadata={
+                                        "type": "text",
+                                        "document_id": document_id,
+                                        "document_name": document_name,
+                                        "section_id": sub_subsection_id,
+                                        "section_text": sub_subsection_title[:200],
+                                        "section_path": sub_subsection_path,
+                                        "level": "sub_subsection",
+                                        "parent_section": subsection_id,
+                                        "parent_title": subsection_title[:100]
+                                    }
+                                )
+                                documents.append(doc)
+
+                            if 'sub_sub_subsections' in sub_subsection:
+                                for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
+                                    sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
+                                    sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
+                                    sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
+
+                                    if sub_sub_subsection_text.strip():
+                                        doc = Document(
+                                            text=sub_sub_subsection_text,
+                                            metadata={
+                                                "type": "text",
+                                                "document_id": document_id,
+                                                "document_name": document_name,
+                                                "section_id": sub_sub_subsection_id,
+                                                "section_text": sub_sub_subsection_title[:200],
+                                                "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
+                                                "level": "sub_sub_subsection",
+                                                "parent_section": sub_subsection_id,
+                                                "parent_title": sub_subsection_title[:100]
+                                            }
+                                        )
+                                        documents.append(doc)

    return documents
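For reference, extract_text_from_json expects the four-level nesting below; a hypothetical minimal sample whose keys match the `.get(...)` calls above:

    # Hypothetical input shape; field names follow the code, values are made up
    sample = {
        "document_metadata": {"document_id": "ГОСТ-1234", "document_name": "Пример"},
        "sections": [{
            "section_id": "1",
            "section_text": "Общие положения\nТекст раздела...",
            "subsections": [{
                "subsection_id": "1.1",
                "subsection_text": "Термины\nТекст подраздела...",
                "sub_subsections": [{
                    "sub_subsection_id": "1.1.1",
                    "sub_subsection_text": "Определения\nТекст...",
                    "sub_sub_subsections": []
                }]
            }]
        }]
    }
    docs = extract_text_from_json(sample, "ГОСТ-1234", "Пример")
    # one Document per non-empty level, each carrying a hierarchical section_path
    # plus parent_section/parent_title metadata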

def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
+    log_message("Начинаю загрузку JSON документов")

    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
        json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]

+        log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")

        all_documents = []

        for zip_file_path in zip_files:
            try:
+                log_message(f"Загружаю ZIP архив: {zip_file_path}")
                local_zip_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=zip_file_path,

                    token=hf_token
                )

+                documents = extract_zip_and_process_json(local_zip_path)
+                all_documents.extend(documents)
+                log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")

            except Exception as e:
+                log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
                continue

        for file_path in json_files:
            try:
+                log_message(f"Обрабатываю прямой JSON файл: {file_path}")
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,

                with open(local_path, 'r', encoding='utf-8') as f:
                    json_data = json.load(f)

+                document_metadata = json_data.get('document_metadata', {})
+                document_id = document_metadata.get('document_id', 'unknown')
+                document_name = document_metadata.get('document_name', 'unknown')
+
+                documents = extract_text_from_json(json_data, document_id, document_name)
+                all_documents.extend(documents)

+                log_message(f"Извлечено {len(documents)} документов из {file_path}")

            except Exception as e:
+                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

+        log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")

+        # Process documents through chunking function
        chunked_documents, chunk_info = process_documents_with_chunking(all_documents)

+        log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")

        return chunked_documents, chunk_info

    except Exception as e:
+        log_message(f"Ошибка загрузки JSON документов: {str(e)}")
        return [], []
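End-to-end, the loader would be invoked roughly like this (a sketch; only the repo id comes from the app config, the directory names are placeholders for this dataset's layout):

    import os

    docs, chunk_info = load_json_documents(
        repo_id="MrSimple01/AIEXP_RAG_FILES",
        hf_token=os.getenv("HF_TOKEN"),
        json_files_dir="json_files",   # placeholder
        download_dir="downloads",      # placeholder
    )
    print(f"{len(docs)} chunks ready for indexing")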

+def extract_section_title(section_text):
+    if not section_text.strip():
+        return ""
+
+    lines = section_text.strip().split('\n')
+    first_line = lines[0].strip()
+
+    if len(first_line) < 200 and not first_line.endswith('.'):
+        return first_line
+
+    # Otherwise, extract first sentence
+    sentences = first_line.split('.')
+    if len(sentences) > 1:
+        return sentences[0].strip()
+
+    return first_line[:100] + "..." if len(first_line) > 100 else first_line
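The title heuristic in action (illustrative inputs):

    extract_section_title("Общие положения\nНастоящий стандарт устанавливает...")
    # → "Общие положения"            (short first line without a trailing dot)
    extract_section_title("Это первое предложение. А это второе.")
    # → "Это первое предложение"     (first sentence of a dotted first line)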

+def extract_zip_and_process_json(zip_path):
+    documents = []
+
+    try:
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_files = zip_ref.namelist()
+            json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
+
+            log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
+
+            for json_file in json_files:
+                try:
+                    log_message(f"Обрабатываю файл из архива: {json_file}")
+
+                    with zip_ref.open(json_file) as f:
+                        json_data = json.load(f)
+
+                    document_metadata = json_data.get('document_metadata', {})
+                    document_id = document_metadata.get('document_id', 'unknown')
+                    document_name = document_metadata.get('document_name', 'unknown')
+
+                    docs = extract_text_from_json(json_data, document_id, document_name)
+                    documents.extend(docs)
+
+                    log_message(f"Извлечено {len(docs)} документов из {json_file}")
+
+                except Exception as e:
+                    log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
+                    continue
+
+    except Exception as e:
+        log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
+
+    return documents

def load_image_data(repo_id, hf_token, image_data_dir):
+    log_message("Начинаю загрузку данных изображений")

+    image_files = []
    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        for file in files:
+            if file.startswith(image_data_dir) and file.endswith('.csv'):
+                image_files.append(file)

        log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")

        image_documents = []
        for file_path in image_files:
            try:
+                log_message(f"Обрабатываю файл изображений: {file_path}")
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,

                )

                df = pd.read_csv(local_path)
+                log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")

+                # Обработка с правильными названиями колонок
                for _, row in df.iterrows():
+                    section_value = row.get('Раздел документа', 'Неизвестно')
+
                    content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
                    content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
+                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"  # Опечатка в названии колонки
                    content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
+                    content += f"Раздел: {section_value}\n"
+                    content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"

                    doc = Document(
                        text=content,

                            "type": "image",
                            "image_number": str(row.get('№ Изображения', 'unknown')),
                            "image_title": str(row.get('Название изображения', 'unknown')),
+                            "image_description": str(row.get('Описание изображение', 'unknown')),
                            "document_id": str(row.get('Обозначение документа', 'unknown')),
+                            "file_path": str(row.get('Файл изображения', 'unknown')),
+                            "section": str(section_value),
+                            "section_id": str(section_value)
                        }
                    )
                    image_documents.append(doc)

            except Exception as e:
+                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

+        log_message(f"Создано {len(image_documents)} документов из изображений")
        return image_documents

    except Exception as e:
+        log_message(f"Ошибка загрузки данных изображений: {str(e)}")
        return []
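The image CSVs must carry exactly the Russian column names read above, including the 'Описание изображение' typo. A hypothetical one-row sample for a local test:

    import pandas as pd

    df = pd.DataFrame([{
        "№ Изображения": "Рис. 1",
        "Название изображения": "Схема установки",
        "Описание изображение": "Общий вид",   # typo preserved on purpose to match the code
        "Обозначение документа": "ГОСТ-1234",
        "Раздел документа": "3.2",
        "Файл изображения": "fig1.png",
    }])
    df.to_csv("images_sample.csv", index=False)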

def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
+    log_message("Загружаю данные чанков из CSV")

    try:
        chunks_csv_path = hf_hub_download(

        chunks_df = pd.read_csv(chunks_csv_path)
        log_message(f"Загружено {len(chunks_df)} чанков из CSV")

        text_column = None
        for col in chunks_df.columns:
+            if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
                text_column = col
                break

        if text_column is None:
            text_column = chunks_df.columns[0]

+        log_message(f"Использую колонку: {text_column}")
+
        documents = []
        for i, (_, row) in enumerate(chunks_df.iterrows()):
            doc = Document(

            )
            documents.append(doc)

+        log_message(f"Создано {len(documents)} текстовых документов из CSV")
        return documents, chunks_df

    except Exception as e:
+        log_message(f"Ошибка загрузки CSV данных: {str(e)}")
        return [], None
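The column auto-detection above is equivalent to this one-liner (sketch on a toy DataFrame):

    import pandas as pd

    df = pd.DataFrame({"id": [1], "chunk_text": ["пример чанка"]})
    text_column = next(
        (col for col in df.columns
         if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower()),
        df.columns[0],
    )
    # → "chunk_text"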
table_prep.py
CHANGED
@@ -1,7 +1,6 @@

-from collections import defaultdict
-import json
-from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document
from my_logging import log_message

def create_table_content(table_data):

@@ -11,6 +10,7 @@ def create_table_content(table_data):

    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    content = f"Таблица: {table_num}\n"
    content += f"Название: {table_title}\n"
    content += f"Документ: {doc_id}\n"

@@ -20,6 +20,7 @@ def create_table_content(table_data):

    if headers:
        content += f"\nЗаголовки: {' | '.join(headers)}\n"

    if 'data' in table_data and isinstance(table_data['data'], list):
        content += "\nДанные таблицы:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):

@@ -29,42 +30,24 @@ def create_table_content(table_data):

    return content

-from llama_index.core.text_splitter import SentenceSplitter
-from config import CHUNK_SIZE, CHUNK_OVERLAP
-
-def extract_table_metadata(table_text: str) -> dict:
-    words = table_text.split()
-    unique_words = set(words)
-
-    from collections import Counter
-    stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
-    filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
-    common = Counter(filtered).most_common(15)
-    key_terms = [w for w, _ in common]
-
-    return {
-        "summary": f"Таблица содержит около {len(words)} слов и {len(unique_words)} уникальных терминов.",
-        "materials": [],  # if you want to extract material names, hook in regex or LLM here
-        "key_terms": key_terms
-    }

def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP

-    # Extract critical metadata from table before chunking
-    table_metadata = extract_table_metadata(doc.text)
    table_num = doc.metadata.get('table_number', 'unknown')
-    table_title = doc.metadata.get('table_title', 'unknown')
    doc_id = doc.metadata.get('document_id', 'unknown')
-    section = doc.metadata.get('section', 'unknown')

-    # Parse table structure
    lines = doc.text.strip().split('\n')

-    # Find where data rows start
    table_header_lines = []
    data_rows = []
    in_data = False

@@ -80,96 +63,68 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):

    table_header = '\n'.join(table_header_lines) + '\n'

-    …
-
-    text_chunks = []
-    current_chunk_rows = []
-    current_size = 0
-
-    for row in data_rows:
-        row_size = len(row) + 1
-
-        # Check if adding this row exceeds limit
-        if current_size + row_size > available_size and current_chunk_rows:
-            # Create chunk
-            chunk_text = table_header + '\n'.join(current_chunk_rows)
-            text_chunks.append(chunk_text)
-            log_message(f"  ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-
-            # Overlap: keep last 2 rows
-            overlap_count = min(2, len(current_chunk_rows))
-            current_chunk_rows = current_chunk_rows[-overlap_count:]
-            current_size = sum(len(r) + 1 for r in current_chunk_rows)
-
-        current_chunk_rows.append(row)
-        current_size += row_size

-    # …
-    if current_chunk_rows:
        chunk_text = table_header + '\n'.join(current_chunk_rows)
        text_chunks.append(chunk_text)
-

-    chunked_docs = []
-    materials = table_metadata.get("materials", [])
-    key_terms = table_metadata.get("key_terms", [])

    for i, chunk_text in enumerate(text_chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(text_chunks),
            "chunk_size": len(chunk_text),
-            "is_chunked": True
-            "materials": materials,
-            "key_terms": key_terms,
-            "table_summary": table_metadata.get("summary", "")
        })

-        # Enrichment prefix
-        materials_str = ', '.join(materials[:10]) if materials else 'нет'
-        terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
-
-        enriched_text = f"""[Таблица {table_num}: {table_title}]
-[Материалы в таблице: {materials_str}]
-[Ключевые термины: {terms_str}]
-
-{chunk_text}"""
-
-        log_message(f"  ✓ Чанк {i+1}/{len(text_chunks)}: "
-                    f"размер={len(enriched_text)}, "
-                    f"материалов={len(materials)}, "
-                    f"терминов={len(key_terms)}")
-
        chunked_doc = Document(
-            text=enriched_text,
            metadata=chunk_metadata
        )
        chunked_docs.append(chunked_doc)

    return chunked_docs

def table_to_document(table_data, document_id=None):
    if not isinstance(table_data, dict):
-        log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
        return []

    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')

@@ -178,13 +133,12 @@ def table_to_document(table_data, document_id=None):

    section = table_data.get('section', 'Неизвестно')

    table_rows = table_data.get('data', [])
-    if not table_rows …
-        log_message(f"⚠️ …")
        return []

    content = create_table_content(table_data)
    content_size = len(content)
-    row_count = len(table_rows)

    base_doc = Document(
        text=content,

@@ -195,111 +149,15 @@ def table_to_document(table_data, document_id=None):

            "document_id": doc_id,
            "section": section,
            "section_id": section,
-            "total_rows": row_count,
            "content_size": content_size
        }
    )

    if content_size > CHUNK_SIZE:
-        log_message(f"📊 CHUNKING: Таблица {table_num} …")
-
        chunked_docs = chunk_table_document(base_doc)
-        log_message(f"  ✂️ Разделена на {len(chunked_docs)} чанков")
-        for i, chunk_doc in enumerate(chunked_docs):
-            log_message(f"    Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
-        return chunked_docs
    else:
-        log_message(f"✓ …")
-
-        return [base_doc]
-
-
-def load_table_data(repo_id, hf_token, table_data_dir):
-    log_message("=" * 60)
-    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
-    log_message("=" * 60)
-
-    try:
-        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
-
-        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
-
-        table_documents = []
-        stats = {
-            'total_tables': 0,
-            'total_size': 0,
-            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
-        }
-
-        for file_path in table_files:
-            try:
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=file_path,
-                    local_dir='',
-                    repo_type="dataset",
-                    token=hf_token
-                )
-
-                log_message(f"\nОбработка файла: {file_path}")
-
-                with open(local_path, 'r', encoding='utf-8') as f:
-                    table_data = json.load(f)
-
-                if isinstance(table_data, dict):
-                    document_id = table_data.get('document', 'unknown')
-
-                    if 'sheets' in table_data:
-                        sorted_sheets = sorted(
-                            table_data['sheets'],
-                            key=lambda sheet: sheet.get('table_number', '')  # or use 'table_number'
-                        )
-
-                        for sheet in sorted_sheets:
-                            sheet['document'] = document_id
-                            docs_list = table_to_document(sheet, document_id)
-                            table_documents.extend(docs_list)
-
-                            for doc in docs_list:
-                                stats['total_tables'] += 1
-                                size = doc.metadata.get('content_size', 0)
-                                stats['total_size'] += size
-                                stats['by_document'][document_id]['count'] += 1
-                                stats['by_document'][document_id]['size'] += size
-                    else:
-                        docs_list = table_to_document(table_data, document_id)
-                        table_documents.extend(docs_list)
-
-                        for doc in docs_list:
-                            stats['total_tables'] += 1
-                            size = doc.metadata.get('content_size', 0)
-                            stats['total_size'] += size
-                            stats['by_document'][document_id]['count'] += 1
-                            stats['by_document'][document_id]['size'] += size
-
-            except Exception as e:
-                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
-                continue
-
-        # Log summary statistics
-        log_message("\n" + "=" * 60)
-        log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
-        log_message("=" * 60)
-        log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
-        log_message(f"Общий размер: {stats['total_size']:,} символов")
-        log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
-
-        log_message("\nПо документам:")
-        for doc_id, doc_stats in sorted(stats['by_document'].items()):
-            log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, "
-                        f"{doc_stats['size']:,} символов")
-
-        log_message("=" * 60)
-
-        return table_documents
-
-    except Exception as e:
-        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
-        return []

+from llama_index.core.text_splitter import SentenceSplitter
from llama_index.core import Document
+from config import CHUNK_SIZE, CHUNK_OVERLAP
from my_logging import log_message

def create_table_content(table_data):

    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

+    # Header section
    content = f"Таблица: {table_num}\n"
    content += f"Название: {table_title}\n"
    content += f"Документ: {doc_id}\n"

    if headers:
        content += f"\nЗаголовки: {' | '.join(headers)}\n"

+    # Data section
    if 'data' in table_data and isinstance(table_data['data'], list):
        content += "\nДанные таблицы:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):

    return content


def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
+    """
+    Smart table chunking:
+    - Small tables: keep whole
+    - Large tables: split by row-blocks, preserve headers in each chunk
+    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP

    table_num = doc.metadata.get('table_number', 'unknown')
    doc_id = doc.metadata.get('document_id', 'unknown')

+    # Parse table structure
    lines = doc.text.strip().split('\n')

    table_header_lines = []
    data_rows = []
    in_data = False

    table_header = '\n'.join(table_header_lines) + '\n'

+    # If no data rows or small table, use standard splitting
+    if not data_rows or len(doc.text) < chunk_size * 1.5:
+        log_message(f"  📊 Таблица {table_num}: малая, без разбиения")
+        return [doc]
+
+    # Row-block chunking for large tables
+    log_message(f"  📋 Таблица {table_num}: {len(data_rows)} строк → row-block chunking")
+
+    header_size = len(table_header)
+    available_size = chunk_size - header_size - 100  # Reserve space
+
+    text_chunks = []
+    current_chunk_rows = []
+    current_size = 0
+
+    for row in data_rows:
+        row_size = len(row) + 1

+        # Check if adding this row exceeds limit
+        if current_size + row_size > available_size and current_chunk_rows:
+            # Create chunk with header + rows
            chunk_text = table_header + '\n'.join(current_chunk_rows)
            text_chunks.append(chunk_text)

+            # Overlap: keep last 2 rows for context continuity
+            overlap_count = min(2, len(current_chunk_rows))
+            current_chunk_rows = current_chunk_rows[-overlap_count:]
+            current_size = sum(len(r) + 1 for r in current_chunk_rows)

+        current_chunk_rows.append(row)
+        current_size += row_size

+    # Final chunk
+    if current_chunk_rows:
+        chunk_text = table_header + '\n'.join(current_chunk_rows)
+        text_chunks.append(chunk_text)

+    log_message(f"  ✂️ Таблица {table_num} → {len(text_chunks)} чанков")

+    # Create Document objects
+    chunked_docs = []
    for i, chunk_text in enumerate(text_chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(text_chunks),
            "chunk_size": len(chunk_text),
+            "is_chunked": True
        })

        chunked_doc = Document(
+            text=chunk_text,
            metadata=chunk_metadata
        )
        chunked_docs.append(chunked_doc)

    return chunked_docs
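A worked example of the row-block budget (illustrative numbers): with chunk_size=1000 and a 120-character header, available_size is 1000 - 120 - 100 = 780, so rows of about 100 characters pack seven per block and the last two rows of each block reappear at the top of the next one:

    # Sketch with made-up sizes: 20 rows of ~100 chars, 120-char header, chunk_size=1000
    table_header = "H" * 120
    data_rows = [f"row {i:02d} " + "x" * 92 for i in range(20)]   # each row ≈ 99 chars
    available_size = 1000 - len(table_header) - 100               # 780 → 7 rows per block
    # block 1: rows 0..6; block 2: rows 5..11 (rows 5..6 carried over as overlap); ...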

def table_to_document(table_data, document_id=None):
+    """Convert table data to Document, with smart chunking if needed"""
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')

    section = table_data.get('section', 'Неизвестно')

    table_rows = table_data.get('data', [])
+    if not table_rows:
+        log_message(f"⚠️ Таблица {table_num} пропущена: нет данных")
        return []

    content = create_table_content(table_data)
    content_size = len(content)

    base_doc = Document(
        text=content,

            "document_id": doc_id,
            "section": section,
            "section_id": section,
+            "total_rows": len(table_rows),
            "content_size": content_size
        }
    )

+    # Apply smart chunking if too large
    if content_size > CHUNK_SIZE:
+        log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
+        return chunk_table_document(base_doc)
    else:
+        log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов)")
+        return [base_doc]
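Putting it together, a hypothetical small table through table_to_document (key names follow the `.get(...)` calls in create_table_content; values are made up):

    table = {
        "table_number": "1",
        "table_title": "Размеры",
        "document": "ГОСТ-1234",
        "section": "2.1",
        "headers": ["Параметр", "Значение"],
        "data": [["Длина", "10 мм"], ["Ширина", "5 мм"]],
    }
    docs = table_to_document(table)
    # Small table → a single Document; a table whose rendered text exceeds
    # CHUNK_SIZE would come back as several header-preserving chunks instead.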