# Utilities for converting extracted table data into searchable LlamaIndex Documents.
import re

from llama_index.core import Document
from llama_index.core.text_splitter import SentenceSplitter

from config import CHUNK_SIZE, CHUNK_OVERLAP
from my_logging import log_message
def normalize_table_number(table_num, section):
    """Normalize a table number to the canonical "№<n>" form for retrieval.

    Strips the common prefixes "Таблица" and "№", and for tables located in
    an appendix ("Приложение ...") appends the section in parentheses so that
    appendix tables with identical numbers remain distinguishable.

    Args:
        table_num: Raw table number (any type; coerced to str). Falsy values
            and the literal 'Неизвестно' yield 'Неизвестно'.
        section: Section name used to detect appendix context; may be None.

    Returns:
        str: 'Неизвестно', '№<n>', or '№<n> (<section>)' for appendix tables.
    """
    if not table_num or table_num == 'Неизвестно':
        return 'Неизвестно'
    # Drop common prefixes so "Таблица №5", "№5" and "5" all normalize alike.
    tn = str(table_num).replace('Таблица', '').replace('№', '').strip()
    # Single case-insensitive test replaces the original redundant pair of
    # checks ('Приложение' in s / 'приложение' in s.lower()) — same behavior.
    if section and 'приложение' in str(section).lower():
        return f"№{tn} ({section})"
    return f"№{tn}"
def create_table_content(table_data):
    """Render one extracted table as plain text optimized for semantic search.

    Produces a searchable header (document / section / table number / title /
    sheet), an optional column list, and one "key: value | ..." line per data
    row. Empty cells and the literal string 'nan' are skipped.

    Args:
        table_data: Dict with optional keys 'document_id'/'document'/
            'Обозначение документа', 'table_number', 'table_title',
            'section'/'Раздел документа', 'sheet_name', 'headers', and 'data'
            (a list of dicts or lists).

    Returns:
        tuple[str, str]: (formatted content, normalized table number).
    """
    doc_id = (
        table_data.get('document_id') or
        table_data.get('document') or
        table_data.get('Обозначение документа') or
        'Неизвестно'
    )
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = (
        table_data.get('section') or
        table_data.get('Раздел документа') or
        'Неизвестно'
    )
    sheet_name = table_data.get('sheet_name', '')

    # Appendix tables additionally get an explicit "Приложения <n>" suffix
    # (on top of the "(<section>)" suffix normalize_table_number adds) so
    # queries such as "таблица 3 приложения 2" also match lexically.
    # 'Приложени' matches both "Приложение" and "Приложения".
    normalized_num = normalize_table_number(table_num, section)
    if 'Приложени' in str(section):
        appendix_match = re.search(r'Приложени[ея]\s*(\d+)', str(section))
        if appendix_match:
            normalized_num = f"{normalized_num} Приложения {appendix_match.group(1)}"

    # Build the searchable header block.
    content = f"Документ: {doc_id}\n"
    content += f"Раздел: {section}\n"
    content += f"Таблица: {normalized_num}\n"
    content += f"Название: {table_title}\n"
    if sheet_name:
        content += f"Лист: {sheet_name}\n"
    content += f"\n"
    headers = table_data.get('headers', [])
    if headers:
        header_str = ' | '.join(str(h) for h in headers)
        content += f"Колонки: {header_str}\n\n"

    # CRITICAL: preserve searchable row identifiers verbatim.
    # NOTE(review): the truthiness filter drops falsy cells (0, '', None) as
    # well as the literal 'nan' — numeric zeros are lost; confirm intended.
    if 'data' in table_data and isinstance(table_data['data'], list):
        for row in table_data['data']:
            if isinstance(row, dict):
                # Keep every meaningful key-value pair of the row.
                row_parts = [
                    f"{k}: {v}"
                    for k, v in row.items()
                    if v and str(v).strip() and str(v) != 'nan'
                ]
                if row_parts:
                    content += ' | '.join(row_parts) + "\n"
            elif isinstance(row, list):
                row_str = ' | '.join(
                    str(v) for v in row if v and str(v).strip() and str(v) != 'nan'
                )
                if row_str:
                    content += row_str + "\n"
    return content, normalized_num
def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
    """Split an oversized table Document into row-aligned chunks.

    Every chunk repeats the table header (everything up to and including the
    "Колонки:" line) so each chunk remains self-describing, and carries the
    last 2 rows of the previous chunk as overlap.

    Args:
        doc: llama_index Document produced by table_to_document().
        chunk_size: Target chunk size in characters; defaults to CHUNK_SIZE.
        chunk_overlap: Defaults to CHUNK_OVERLAP but is otherwise unused —
            the overlap is hard-coded to 2 rows below.  # NOTE(review): confirm.

    Returns:
        list: [doc] unchanged for small tables, otherwise a list of new
        Documents with chunk bookkeeping merged into the original metadata.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP
    table_num = doc.metadata.get('table_number', 'unknown')
    doc_id = doc.metadata.get('document_id', 'unknown')
    section = doc.metadata.get('section', 'Неизвестно')
    full_table_id = f"{doc_id} | {section} | {table_num}"
    lines = doc.text.strip().split('\n')
    # Find where data rows start. If no "Колонки:" line exists, data_start_idx
    # stays 0: the header is empty and every line is treated as a data row.
    data_start_idx = 0
    for i, line in enumerate(lines):
        if line.startswith('Колонки:'):
            data_start_idx = i + 2  # Skip header and blank line
            break
    table_header = '\n'.join(lines[:data_start_idx])
    data_rows = lines[data_start_idx:]
    # Pass small tables through unchanged; the 1.5x factor avoids splitting
    # tables that are only marginally over the limit.
    if not data_rows or len(doc.text) < chunk_size * 1.5:
        log_message(f" 📊 {full_table_id}: малая таблица, без разбиения")
        return [doc]
    log_message(f" 📋 {full_table_id}: {len(data_rows)} строк → chunking")
    header_size = len(table_header)
    # Row budget per chunk: the header is repeated in every chunk and 100
    # characters are reserved as slack for joins/newlines.
    available_size = chunk_size - header_size - 100
    text_chunks = []
    current_chunk_rows = []
    current_size = 0
    for row in data_rows:
        row_size = len(row) + 1  # +1 for the joining newline
        if current_size + row_size > available_size and current_chunk_rows:
            # Flush the accumulated rows as one chunk (header + rows).
            chunk_text = table_header + '\n' + '\n'.join(current_chunk_rows)
            text_chunks.append(chunk_text)
            # Keep last 2 rows for overlap between consecutive chunks.
            overlap_count = min(2, len(current_chunk_rows))
            current_chunk_rows = current_chunk_rows[-overlap_count:]
            current_size = sum(len(r) + 1 for r in current_chunk_rows)
        current_chunk_rows.append(row)
        current_size += row_size
    if current_chunk_rows:
        # Flush the trailing partial chunk.
        chunk_text = table_header + '\n' + '\n'.join(current_chunk_rows)
        text_chunks.append(chunk_text)
    log_message(f" ✂️ {full_table_id} → {len(text_chunks)} чанков")
    chunked_docs = []
    for i, chunk_text in enumerate(text_chunks):
        # Propagate the original metadata and add chunk bookkeeping on top.
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(text_chunks),
            "chunk_size": len(chunk_text),
            "is_chunked": True,
            "full_table_id": full_table_id,
            "table_number_normalized": doc.metadata.get('table_number_normalized')
        })
        chunked_doc = Document(
            text=chunk_text,
            metadata=chunk_metadata
        )
        chunked_docs.append(chunked_doc)
    return chunked_docs
def table_to_document(table_data, document_id=None):
    """Convert table data to Document(s) with complete metadata.

    NOTE(review): this function is redefined later in this file; the later
    definition shadows this one at runtime, so this copy is dead code (and
    the later copy omits sheet_name from metadata). Keep only one.

    Args:
        table_data: Dict describing one extracted table (see
            create_table_content for the recognized keys).
        document_id: Fallback document id when the table dict carries none.

    Returns:
        list: [] for non-dict or empty-data input, [Document] for small
        tables, or several chunked Documents when content exceeds CHUNK_SIZE.
    """
    if not isinstance(table_data, dict):
        return []
    # Prefer the document id recorded on the sheet itself over the caller's.
    sheet_doc_id = (
        table_data.get('document_id') or
        table_data.get('document') or
        table_data.get('Обозначение документа')
    )
    doc_id = sheet_doc_id or document_id or 'Неизвестно'
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', table_data.get('Раздел документа', 'Неизвестно'))
    sheet_name = table_data.get('sheet_name', '')
    table_rows = table_data.get('data', [])
    if not table_rows:
        log_message(f"⚠️ Таблица {table_num} ({doc_id}) пропущена: нет данных")
        return []
    content, normalized_num = create_table_content(table_data)
    content_size = len(content)
    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_number_normalized": normalized_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "sheet_name": sheet_name,
            "total_rows": len(table_rows),
            "content_size": content_size,
            "full_table_id": f"{doc_id} | {section} | {normalized_num}"
        }
    )
    # Oversized tables go through row-aligned chunking.
    if content_size > CHUNK_SIZE:
        log_message(f"📊 CHUNKING: {doc_id} | {normalized_num} | {content_size} > {CHUNK_SIZE}")
        return chunk_table_document(base_doc)
    else:
        log_message(f"✓ {doc_id} | {normalized_num} ({content_size} символов)")
        return [base_doc]
def table_to_document(table_data, document_id=None):
    """Convert table data to Document(s) with proper metadata.

    NOTE(review): this duplicates a near-identical function defined earlier
    in this file; being defined later, this copy is the one actually used.
    The duplication should be resolved by keeping a single definition.

    Args:
        table_data: Dict describing one extracted table (see
            create_table_content for the recognized keys).
        document_id: Fallback document id when the table dict carries none.

    Returns:
        list: [] for non-dict or empty-data input, [Document] for small
        tables, or several chunked Documents when content exceeds CHUNK_SIZE.
    """
    if not isinstance(table_data, dict):
        return []
    # Prefer the sheet-level document id over the one passed by the caller.
    sheet_doc_id = (
        table_data.get('document_id') or
        table_data.get('document') or
        table_data.get('Обозначение документа')
    )
    doc_id = sheet_doc_id or document_id or 'Неизвестно'
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', table_data.get('Раздел документа', 'Неизвестно'))
    # FIX: this copy dropped sheet_name from metadata (the earlier duplicate
    # kept it); restore it so metadata matches the "Лист:" line emitted by
    # create_table_content.
    sheet_name = table_data.get('sheet_name', '')
    table_rows = table_data.get('data', [])
    if not table_rows:
        log_message(f"⚠️ Таблица {table_num} ({doc_id}) пропущена: нет данных")
        return []
    content, normalized_num = create_table_content(table_data)
    content_size = len(content)
    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_number_normalized": normalized_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "sheet_name": sheet_name,
            "total_rows": len(table_rows),
            "content_size": content_size,
            "full_table_id": f"{doc_id} | {section} | {normalized_num}"
        }
    )
    # Oversized tables go through row-aligned chunking.
    if content_size > CHUNK_SIZE:
        log_message(f"📊 CHUNKING: {doc_id} | {normalized_num} | {content_size} > {CHUNK_SIZE}")
        return chunk_table_document(base_doc)
    else:
        log_message(f"✓ {doc_id} | {normalized_num} ({content_size} символов)")
        return [base_doc]