Commit · ff92caa · 1 Parent(s): abca2ac
added the load_table_data function

Files changed:
- app.py +2 -2
- documents_prep.py +96 -5
- table_prep.py +0 -5
app.py
CHANGED
@@ -1,8 +1,8 @@
 import gradio as gr
 import os
 from llama_index.core import Settings
-from documents_prep import
-from utils import
+from documents_prep import *
+from utils import *
 from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
 import sys
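Note: with these wildcard imports, everything public in documents_prep.py (including the new load_table_data shown below) and utils.py becomes available to app.py without listing each name. A rough sketch of how the new loader could be wired in on the app side; the repo id, directory name, and HF_TOKEN env var are illustrative assumptions, not values from this commit:

# Illustrative sketch only; repo id, directory name, and env var are assumptions.
import os

hf_token = os.environ.get("HF_TOKEN")
table_docs = load_table_data(
    repo_id="username/my-dataset",   # hypothetical dataset repo
    hf_token=hf_token,
    table_data_dir="table_data",     # hypothetical folder of per-document JSON files
)
chunked_docs = process_documents_with_chunking(table_docs)  # defined in documents_prep.py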
documents_prep.py
CHANGED
@@ -48,11 +48,6 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
 
 
 def process_documents_with_chunking(documents):
-    """
-    Process all document types with appropriate chunking.
-    Tables: row-block chunking (handled in table_prep.py)
-    Text/Images: sentence-aware chunking
-    """
     all_chunked_docs = []
     stats = {
         'table_whole': 0,
@@ -397,6 +392,102 @@ def load_image_data(repo_id, hf_token, image_data_dir):
         return []
 
 
+def load_table_data(repo_id, hf_token, table_data_dir):
+    """Load and process table data from HuggingFace repo"""
+    log_message("=" * 60)
+    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
+    log_message("=" * 60)
+
+    try:
+        from huggingface_hub import hf_hub_download, list_repo_files
+        import json
+        from collections import defaultdict
+
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
+
+        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
+
+        table_documents = []
+        stats = {
+            'total_tables': 0,
+            'total_size': 0,
+            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
+        }
+
+        for file_path in table_files:
+            try:
+                local_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_path,
+                    local_dir='',
+                    repo_type="dataset",
+                    token=hf_token
+                )
+
+                log_message(f"\nОбработка файла: {file_path}")
+
+                with open(local_path, 'r', encoding='utf-8') as f:
+                    table_data = json.load(f)
+
+                if isinstance(table_data, dict):
+                    document_id = table_data.get('document', 'unknown')
+
+                    # Handle multiple sheets
+                    if 'sheets' in table_data:
+                        sorted_sheets = sorted(
+                            table_data['sheets'],
+                            key=lambda sheet: sheet.get('table_number', '')
+                        )
+
+                        for sheet in sorted_sheets:
+                            sheet['document'] = document_id
+                            docs_list = table_to_document(sheet, document_id)
+                            table_documents.extend(docs_list)
+
+                            for doc in docs_list:
+                                stats['total_tables'] += 1
+                                size = doc.metadata.get('content_size', 0)
+                                stats['total_size'] += size
+                                stats['by_document'][document_id]['count'] += 1
+                                stats['by_document'][document_id]['size'] += size
+                    else:
+                        # Single table
+                        docs_list = table_to_document(table_data, document_id)
+                        table_documents.extend(docs_list)
+
+                        for doc in docs_list:
+                            stats['total_tables'] += 1
+                            size = doc.metadata.get('content_size', 0)
+                            stats['total_size'] += size
+                            stats['by_document'][document_id]['count'] += 1
+                            stats['by_document'][document_id]['size'] += size
+
+            except Exception as e:
+                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
+                continue
+
+        # Log summary
+        log_message("\n" + "=" * 60)
+        log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
+        log_message("=" * 60)
+        log_message(f"Всего таблиц: {stats['total_tables']}")
+        log_message(f"Общий размер: {stats['total_size']:,} символов")
+        if stats['total_tables'] > 0:
+            log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
+
+        log_message("\nПо документам:")
+        for doc_id, doc_stats in sorted(stats['by_document'].items()):
+            log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, {doc_stats['size']:,} символов")
+
+        log_message("=" * 60)
+
+        return table_documents
+
+    except Exception as e:
+        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА: {str(e)}")
+        return []
+
 def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
     log_message("Загружаю данные чанков из CSV")
 
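load_table_data depends on table_to_document, which is not part of this diff (it presumably lives elsewhere in documents_prep.py or in table_prep.py). For orientation only, a hypothetical sketch of the shape that call appears to expect: a list of llama_index Documents whose metadata carries a content_size key. Every field name beyond the ones load_table_data actually reads is an assumption:

# Hypothetical sketch of table_to_document; the real implementation is not shown in this
# commit and may differ. Only the 'content_size' metadata key is read by load_table_data.
from llama_index.core import Document

def table_to_document(table_data, document_id):
    headers = table_data.get('headers', [])          # assumed key
    rows = table_data.get('rows', [])                # assumed key
    lines = [" | ".join(str(h) for h in headers)]
    lines += [" | ".join(str(c) for c in row) for row in rows]
    text = "\n".join(lines)
    return [Document(
        text=text,
        metadata={
            'document': document_id,
            'table_number': table_data.get('table_number', ''),
            'content_size': len(text),               # used by load_table_data stats
        },
    )]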
table_prep.py
CHANGED
@@ -32,11 +32,6 @@ def create_table_content(table_data):
 
 
 def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
-    """
-    Smart table chunking:
-    - Small tables: keep whole
-    - Large tables: split by row-blocks, preserve headers in each chunk
-    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
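The docstring removed here described the chunking strategy: small tables are kept whole, while large tables are split into row-blocks with the header repeated in each chunk. A minimal standalone sketch of that header-preserving row-block split, assuming the table is already rendered as one header line plus row lines; the function name and size accounting are illustrative, not the actual chunk_table_document logic:

# Sketch of header-preserving row-block chunking; illustrative, not code from table_prep.py.
def split_table_by_row_blocks(header_line, row_lines, chunk_size):
    """Group rows into blocks so each chunk (header + rows) stays within chunk_size."""
    chunks, current, used = [], [], 0
    budget = chunk_size - len(header_line) - 1  # reserve room for the repeated header
    for row in row_lines:
        if current and used + len(row) + 1 > budget:
            chunks.append("\n".join([header_line] + current))
            current, used = [], 0
        current.append(row)
        used += len(row) + 1  # +1 for the newline
    if current:
        chunks.append("\n".join([header_line] + current))
    return chunks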