Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 8, 2025

Commit

35eb459

1 Parent(s): 5ebc241

top_k = 150, max chunk size = 4000, max rows = 15, similarity cutoff = 0.45

Browse files

Files changed (3) hide show

documents_prep.py +10 -40
index_retriever.py +4 -4
table_prep.py +107 -107

documents_prep.py CHANGED Viewed

@@ -157,11 +157,18 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
-    content = f"ТАБЛИЦА {table_identifier} из {doc_id}\n"
     if table_title:
         content += f"НАЗВАНИЕ: {table_title}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"
     content += f"{'='*70}\n"
     if headers:
@@ -199,40 +206,6 @@ def format_table_footer(table_identifier, doc_id):
     """Format table footer"""
     return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
-def load_table_documents(repo_id, hf_token, table_dir):
-    log_message("Loading tables...")
-    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-    table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
-    all_chunks = []
-    for file_path in table_files:
-        try:
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=file_path,
-                repo_type="dataset",
-                token=hf_token
-            )
-            with open(local_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-            file_doc_id = data.get('document_id', data.get('document', 'unknown'))
-            for sheet in data.get('sheets', []):
-                sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
-                all_chunks.extend(chunks)
-        except Exception as e:
-            log_message(f"Error loading {file_path}: {e}")
-    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
-    return all_chunks
 def load_json_documents(repo_id, hf_token, json_dir):
     import zipfile
     import tempfile
@@ -414,7 +387,6 @@ def extract_sections_from_json(json_path):
 def load_table_documents(repo_id, hf_token, table_dir):
-    """Load and chunk tables"""
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -433,15 +405,13 @@ def load_table_documents(repo_id, hf_token, table_dir):
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
-            # Extract file-level document_id
             file_doc_id = data.get('document_id', data.get('document', 'unknown'))
             for sheet in data.get('sheets', []):
-                # Use sheet-level document_id if available, otherwise use file-level
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                # CRITICAL: Pass document_id to chunk function
-                chunks = chunk_table_by_content(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)
         except Exception as e:

 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
+    content = f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"
+    # Add table type/number prominently for matching
+    if table_num:
+        content += f"ТИП: {table_num}\n"
     if table_title:
         content += f"НАЗВАНИЕ: {table_title}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"
     content += f"{'='*70}\n"
     if headers:
     """Format table footer"""
     return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
 def load_json_documents(repo_id, hf_token, json_dir):
     import zipfile
     import tempfile
 def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
             file_doc_id = data.get('document_id', data.get('document', 'unknown'))
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                # Use the consistent MAX_CHARS_TABLE from config
+                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
         except Exception as e:

index_retriever.py CHANGED Viewed

@@ -46,18 +46,18 @@ def create_query_engine(vector_index):
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=120
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
-            similarity_top_k=120,
-            similarity_cutoff=0.35
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
-            similarity_top_k=120,
             num_queries=1
         )

         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
+            similarity_top_k=150
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
+            similarity_top_k=150,
+            similarity_cutoff=0.45
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
+            similarity_top_k=150,
             num_queries=1
         )

table_prep.py CHANGED Viewed

@@ -95,135 +95,135 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk
     return chunked_docs
-def table_to_document(table_data, document_id=None):
-    if not isinstance(table_data, dict):
-        return []
-    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
-    table_num = table_data.get('table_number', 'Неизвестно')
-    table_title = table_data.get('table_title', 'Неизвестно')
-    section = table_data.get('section', 'Неизвестно')
-    table_rows = table_data.get('data', [])
-    if not table_rows:
-        return []
-    # Build table content
-    content = f"Таблица: {table_num}\n"
-    content += f"Название: {table_title}\n"
-    content += f"Документ: {doc_id}\n"
-    content += f"Раздел: {section}\n"
-    headers = table_data.get('headers', [])
-    if headers:
-        content += f"\nЗаголовки: {' | '.join(headers)}\n"
-    content += "\nДанные таблицы:\n"
-    for row_idx, row in enumerate(table_rows, start=1):
-        if isinstance(row, dict):
-            row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
-            content += f"Строка {row_idx}: {row_text}\n"
-    # Create base document
-    base_doc = Document(
-        text=content,
-        metadata={
-            "type": "table",
-            "table_number": table_num,
-            "document_id": doc_id,
-            "section": section
-        }
-    )
-    if len(content) > 4000:
-        chunks = chunk_table_document(base_doc)
-        log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
-        return chunk_table_document(base_doc)
-    return [base_doc]
-def load_table_data(repo_id, hf_token, table_data_dir):
-    try:
-        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
-        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
-        table_documents = []
-        stats = {
-            'total_tables': 0,
-            'total_size': 0,
-            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
-        }
-        for file_path in table_files:
-            try:
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=file_path,
-                    local_dir='',
-                    repo_type="dataset",
-                    token=hf_token
-                )
-                log_message(f"\nОбработка файла: {file_path}")
-                with open(local_path, 'r', encoding='utf-8') as f:
-                    table_data = json.load(f)
-                    if isinstance(table_data, dict):
-                        document_id = table_data.get('document', 'unknown')
-                        if 'sheets' in table_data:
-                            sorted_sheets = sorted(
-                                table_data['sheets'],
-                                key=lambda sheet: sheet.get('table_number', '')  # or use 'table_number'
-                            )
-                            for sheet in sorted_sheets:
-                                sheet['document'] = document_id
-                                docs_list = table_to_document(sheet, document_id)
-                                table_documents.extend(docs_list)
-                                for doc in docs_list:
-                                    stats['total_tables'] += 1
-                                    size = doc.metadata.get('content_size', 0)
-                                    stats['total_size'] += size
-                                    stats['by_document'][document_id]['count'] += 1
-                                    stats['by_document'][document_id]['size'] += size
-                                    log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
-                        else:
-                            docs_list = table_to_document(table_data, document_id)
-                            table_documents.extend(docs_list)
-                            for doc in docs_list:
-                                stats['total_tables'] += 1
-                                size = doc.metadata.get('content_size', 0)
-                                stats['total_size'] += size
-                                stats['by_document'][document_id]['count'] += 1
-                                stats['by_document'][document_id]['size'] += size
-            except Exception as e:
-                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
-                continue
-        # Log summary statistics
-        log_message("\n" + "=" * 60)
-        log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
-        log_message("=" * 60)
-        log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
-        log_message(f"Общий размер: {stats['total_size']:,} символов")
-        log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
-        log_message("\nПо документам:")
-        for doc_id, doc_stats in sorted(stats['by_document'].items()):
-            log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, "
-                       f"{doc_stats['size']:,} символов")
-        log_message("=" * 60)
-        return table_documents
-    except Exception as e:
-        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
-        return []

     return chunked_docs
+# def table_to_document(table_data, document_id=None):
+#     if not isinstance(table_data, dict):
+#         return []
+#     doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
+#     table_num = table_data.get('table_number', 'Неизвестно')
+#     table_title = table_data.get('table_title', 'Неизвестно')
+#     section = table_data.get('section', 'Неизвестно')
+#     table_rows = table_data.get('data', [])
+#     if not table_rows:
+#         return []
+#     # Build table content
+#     content = f"Таблица: {table_num}\n"
+#     content += f"Название: {table_title}\n"
+#     content += f"Документ: {doc_id}\n"
+#     content += f"Раздел: {section}\n"
+#     headers = table_data.get('headers', [])
+#     if headers:
+#         content += f"\nЗаголовки: {' | '.join(headers)}\n"
+#     content += "\nДанные таблицы:\n"
+#     for row_idx, row in enumerate(table_rows, start=1):
+#         if isinstance(row, dict):
+#             row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
+#             content += f"Строка {row_idx}: {row_text}\n"
+#     # Create base document
+#     base_doc = Document(
+#         text=content,
+#         metadata={
+#             "type": "table",
+#             "table_number": table_num,
+#             "document_id": doc_id,
+#             "section": section
+#         }
+#     )
+#     if len(content) > 4000:
+#         chunks = chunk_table_document(base_doc)
+#         log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
+#         return chunk_table_document(base_doc)
+#     return [base_doc]
+# def load_table_data(repo_id, hf_token, table_data_dir):
+#     try:
+#         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+#         table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
+#         log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
+#         table_documents = []
+#         stats = {
+#             'total_tables': 0,
+#             'total_size': 0,
+#             'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
+#         }
+#         for file_path in table_files:
+#             try:
+#                 local_path = hf_hub_download(
+#                     repo_id=repo_id,
+#                     filename=file_path,
+#                     local_dir='',
+#                     repo_type="dataset",
+#                     token=hf_token
+#                 )
+#                 log_message(f"\nОбработка файла: {file_path}")
+#                 with open(local_path, 'r', encoding='utf-8') as f:
+#                     table_data = json.load(f)
+#                     if isinstance(table_data, dict):
+#                         document_id = table_data.get('document', 'unknown')
+#                         if 'sheets' in table_data:
+#                             sorted_sheets = sorted(
+#                                 table_data['sheets'],
+#                                 key=lambda sheet: sheet.get('table_number', '')  # or use 'table_number'
+#                             )
+#                             for sheet in sorted_sheets:
+#                                 sheet['document'] = document_id
+#                                 docs_list = table_to_document(sheet, document_id)
+#                                 table_documents.extend(docs_list)
+#                                 for doc in docs_list:
+#                                     stats['total_tables'] += 1
+#                                     size = doc.metadata.get('content_size', 0)
+#                                     stats['total_size'] += size
+#                                     stats['by_document'][document_id]['count'] += 1
+#                                     stats['by_document'][document_id]['size'] += size
+#                                     log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
+#                         else:
+#                             docs_list = table_to_document(table_data, document_id)
+#                             table_documents.extend(docs_list)
+#                             for doc in docs_list:
+#                                 stats['total_tables'] += 1
+#                                 size = doc.metadata.get('content_size', 0)
+#                                 stats['total_size'] += size
+#                                 stats['by_document'][document_id]['count'] += 1
+#                                 stats['by_document'][document_id]['size'] += size
+#             except Exception as e:
+#                 log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
+#                 continue
+#         # Log summary statistics
+#         log_message("\n" + "=" * 60)
+#         log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
+#         log_message("=" * 60)
+#         log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
+#         log_message(f"Общий размер: {stats['total_size']:,} символов")
+#         log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
+#         log_message("\nПо документам:")
+#         for doc_id, doc_stats in sorted(stats['by_document'].items()):
+#             log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, "
+#                        f"{doc_stats['size']:,} символов")
+#         log_message("=" * 60)
+#         return table_documents
+#     except Exception as e:
+#         log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
+#         return []