Spaces:
Sleeping
Sleeping
Commit
·
9da507d
1
Parent(s):
a42e1ff
eski holat with utils simplified
Browse files- index_retriever.py +2 -2
- table_prep.py +59 -66
index_retriever.py
CHANGED
|
@@ -46,12 +46,12 @@ def create_query_engine(vector_index):
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
-
similarity_top_k=
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
-
similarity_top_k=
|
| 55 |
similarity_cutoff=0.65
|
| 56 |
)
|
| 57 |
|
|
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
+
similarity_top_k=50
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
+
similarity_top_k=50,
|
| 55 |
similarity_cutoff=0.65
|
| 56 |
)
|
| 57 |
|
table_prep.py
CHANGED
|
@@ -35,128 +35,121 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
|
|
| 35 |
def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
|
| 36 |
"""Simple table chunking: max 5 rows or 2000 chars per chunk"""
|
| 37 |
|
| 38 |
-
table_num = doc.metadata.get('table_number', 'unknown')
|
| 39 |
-
|
| 40 |
-
# Parse table
|
| 41 |
lines = doc.text.strip().split('\n')
|
| 42 |
|
| 43 |
-
|
|
|
|
| 44 |
data_rows = []
|
| 45 |
in_data = False
|
| 46 |
|
| 47 |
for line in lines:
|
| 48 |
if line.startswith('Данные таблицы:'):
|
| 49 |
in_data = True
|
| 50 |
-
|
| 51 |
elif in_data and line.startswith('Строка'):
|
| 52 |
data_rows.append(line)
|
| 53 |
elif not in_data:
|
| 54 |
-
|
| 55 |
|
| 56 |
-
|
| 57 |
|
|
|
|
| 58 |
if not data_rows:
|
| 59 |
-
# No rows, return as is
|
| 60 |
return [doc]
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
# Simple chunking
|
| 65 |
chunks = []
|
| 66 |
-
|
| 67 |
-
current_size = len(
|
| 68 |
|
| 69 |
for row in data_rows:
|
| 70 |
-
row_size = len(row) + 1
|
| 71 |
|
| 72 |
-
# Check if
|
| 73 |
-
if (len(
|
| 74 |
-
current_size + row_size > max_chunk_size) and
|
| 75 |
|
| 76 |
# Save current chunk
|
| 77 |
-
chunk_text =
|
| 78 |
chunks.append(chunk_text)
|
| 79 |
-
log_message(f" Чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
|
| 80 |
|
| 81 |
-
# Start new chunk
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
current_size = len(table_header) + len(current_chunk_rows[0]) + 1
|
| 85 |
-
else:
|
| 86 |
-
current_chunk_rows = []
|
| 87 |
-
current_size = len(table_header)
|
| 88 |
|
| 89 |
-
|
| 90 |
current_size += row_size
|
| 91 |
|
| 92 |
-
#
|
| 93 |
-
if
|
| 94 |
-
chunk_text =
|
| 95 |
chunks.append(chunk_text)
|
| 96 |
-
log_message(f" Последний чанк: {len(current_chunk_rows)} строк")
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
# Create documents
|
| 101 |
chunked_docs = []
|
| 102 |
for i, chunk_text in enumerate(chunks):
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
return chunked_docs
|
| 115 |
|
| 116 |
|
| 117 |
def table_to_document(table_data, document_id=None):
|
|
|
|
|
|
|
| 118 |
if not isinstance(table_data, dict):
|
| 119 |
-
log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
|
| 120 |
return []
|
| 121 |
|
| 122 |
doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
|
| 123 |
table_num = table_data.get('table_number', 'Неизвестно')
|
| 124 |
table_title = table_data.get('table_title', 'Неизвестно')
|
| 125 |
section = table_data.get('section', 'Неизвестно')
|
| 126 |
-
|
| 127 |
table_rows = table_data.get('data', [])
|
| 128 |
-
|
| 129 |
-
|
| 130 |
return []
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
|
|
|
| 136 |
base_doc = Document(
|
| 137 |
text=content,
|
| 138 |
metadata={
|
| 139 |
"type": "table",
|
| 140 |
"table_number": table_num,
|
| 141 |
-
"table_title": table_title,
|
| 142 |
"document_id": doc_id,
|
| 143 |
-
"section": section
|
| 144 |
-
"section_id": section,
|
| 145 |
-
"total_rows": row_count,
|
| 146 |
-
"content_size": content_size
|
| 147 |
}
|
| 148 |
)
|
|
|
|
|
|
|
| 149 |
|
| 150 |
-
|
| 151 |
-
chunked_docs = chunk_table_document(base_doc)
|
| 152 |
-
log_message(f" ✂️ Разделена на {len(chunked_docs)} чанков")
|
| 153 |
-
for i, chunk_doc in enumerate(chunked_docs):
|
| 154 |
-
log_message(f" Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
|
| 155 |
-
return chunked_docs
|
| 156 |
-
else:
|
| 157 |
-
log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
|
| 158 |
-
f"Размер: {content_size} символов | Строк: {row_count}")
|
| 159 |
-
return [base_doc]
|
| 160 |
|
| 161 |
|
| 162 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
|
|
|
| 35 |
def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
    """Split a table Document into smaller overlapping Documents.

    The document text is expected to contain a header section followed by a
    'Данные таблицы:' marker and data lines starting with 'Строка'.  Every
    chunk repeats the full header; consecutive chunks overlap by one data
    row so no row boundary is lost to retrieval.

    Args:
        doc: source Document with ``.text`` and ``.metadata``.
        max_rows_per_chunk: soft cap on data rows per chunk.
        max_chunk_size: soft cap on chunk length in characters (a single
            oversized row can still push a chunk past it).

    Returns:
        List of chunk Documents, or ``[doc]`` unchanged when no data rows
        are found.
    """
    lines = doc.text.strip().split('\n')

    # Separate the header (everything up to and including the data marker)
    # from the data rows.  NOTE(review): lines inside the data section that
    # do not start with 'Строка' are silently dropped — confirm the upstream
    # formatter never wraps a row across lines.
    header_lines = []
    data_rows = []
    in_data = False

    for line in lines:
        if line.startswith('Данные таблицы:'):
            in_data = True
            header_lines.append(line)
        elif in_data and line.startswith('Строка'):
            data_rows.append(line)
        elif not in_data:
            header_lines.append(line)

    header = '\n'.join(header_lines) + '\n'

    # Nothing to chunk — return the original document untouched.
    if not data_rows:
        return [doc]

    # Greedily pack rows into chunks, honouring both the row and size caps.
    chunks = []
    current_rows = []
    current_size = len(header)

    for row in data_rows:
        row_size = len(row) + 1  # +1 for the joining newline

        # Flush when either cap is hit — but never emit an empty chunk.
        if (len(current_rows) >= max_rows_per_chunk or
                current_size + row_size > max_chunk_size) and current_rows:
            chunks.append(header + '\n'.join(current_rows))

            # Start the next chunk, repeating the last row for overlap.
            current_rows = [current_rows[-1]]
            current_size = len(header) + len(current_rows[0]) + 1

        current_rows.append(row)
        current_size += row_size

    # Flush the final, partially filled chunk.
    if current_rows:
        chunks.append(header + '\n'.join(current_rows))

    # Wrap each chunk in a Document, propagating the source metadata.
    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_doc = Document(
            text=chunk_text,
            metadata={
                "type": "table",
                "table_number": doc.metadata.get('table_number'),
                "document_id": doc.metadata.get('document_id'),
                "section": doc.metadata.get('section'),
                "chunk_id": i,
                "total_chunks": len(chunks),
                # Restored: callers previously logged metadata['chunk_size'].
                "chunk_size": len(chunk_text),
                "is_chunked": True
            }
        )
        chunked_docs.append(chunk_doc)

    return chunked_docs
|
| 106 |
|
| 107 |
|
| 108 |
def table_to_document(table_data, document_id=None):
    """Convert one parsed table dict into Document(s), chunking if large.

    Builds a human-readable rendering (table number, title, source document,
    section, optional column headers, then one 'Строка N: ...' line per data
    row) and wraps it in a Document.  Renderings longer than 2000 characters
    are delegated to chunk_table_document().

    Args:
        table_data: dict with 'table_number', 'table_title', 'section',
            optional 'headers', and 'data' (a list of row dicts).
        document_id: overrides table_data['document_id'] / ['document'].

    Returns:
        List of Documents (one, or several chunks), or [] when the input is
        not a dict or contains no rows.
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')
    table_rows = table_data.get('data', [])

    if not table_rows:
        return []

    # Accumulate pieces and join once — avoids quadratic `content +=`.
    parts = [
        f"Таблица: {table_num}\n",
        f"Название: {table_title}\n",
        f"Документ: {doc_id}\n",
        f"Раздел: {section}\n",
    ]

    headers = table_data.get('headers', [])
    if headers:
        parts.append(f"\nЗаголовки: {' | '.join(headers)}\n")

    parts.append("\nДанные таблицы:\n")
    for row_idx, row in enumerate(table_rows, start=1):
        if isinstance(row, dict):
            # Fix: skip only missing/empty cells — a bare truthiness test
            # (`if v`) would also drop legitimate 0 / 0.0 / False values.
            row_text = " | ".join(
                f"{k}: {v}" for k, v in row.items()
                if v is not None and v != ""
            )
            parts.append(f"Строка {row_idx}: {row_text}\n")

    content = "".join(parts)

    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "document_id": doc_id,
            "section": section
        }
    )

    # 2000 mirrors chunk_table_document's default max_chunk_size.
    if len(content) > 2000:
        return chunk_table_document(base_doc)

    return [base_doc]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
def load_table_data(repo_id, hf_token, table_data_dir):
|