MrSimple07 committed on
Commit
f9e7c0c
·
1 Parent(s): b01a551

added the load_table_data function

Browse files
Files changed (3) hide show
  1. documents_prep.py +43 -110
  2. index_retriever.py +62 -126
  3. table_prep.py +68 -76
documents_prep.py CHANGED
@@ -392,120 +392,53 @@ def load_image_data(repo_id, hf_token, image_data_dir):
392
  return []
393
 
394
  def load_table_data(repo_id, hf_token, table_data_dir):
395
- """Load and process table data from HuggingFace repo"""
396
- log_message("=" * 60)
397
- log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
398
- log_message("=" * 60)
399
 
400
- try:
401
- from huggingface_hub import hf_hub_download, list_repo_files
402
- import json
403
- from collections import defaultdict
404
-
405
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
406
- table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
407
-
408
- log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
409
-
410
- table_documents = []
411
- stats = {
412
- 'total_tables': 0,
413
- 'total_size': 0,
414
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
415
- }
416
-
417
- for file_path in table_files:
418
- try:
419
- local_path = hf_hub_download(
420
- repo_id=repo_id,
421
- filename=file_path,
422
- local_dir='',
423
- repo_type="dataset",
424
- token=hf_token
425
- )
426
-
427
- log_message(f"\nОбработка файла: {file_path}")
428
 
429
- with open(local_path, 'r', encoding='utf-8') as f:
430
- table_data = json.load(f)
 
 
 
 
 
431
 
432
- if isinstance(table_data, dict):
433
- # FIXED: Properly extract document_id from multiple possible sources
434
- document_id = (
435
- table_data.get('document_id') or
436
- table_data.get('document') or
437
- table_data.get('Обозначение документа') or
438
- 'unknown'
439
- )
440
-
441
- # Handle multiple sheets
442
- if 'sheets' in table_data:
443
- sorted_sheets = sorted(
444
- table_data['sheets'],
445
- key=lambda sheet: sheet.get('table_number', '')
446
- )
447
-
448
- for sheet in sorted_sheets:
449
- # FIXED: Ensure document_id is always set in sheet data
450
- if 'document' not in sheet and 'document_id' not in sheet:
451
- sheet['document'] = document_id
452
- sheet['document_id'] = document_id
453
-
454
- # FIXED: Pass document_id explicitly
455
- docs_list = table_to_document(sheet, document_id=document_id)
456
- table_documents.extend(docs_list)
457
-
458
- for doc in docs_list:
459
- stats['total_tables'] += 1
460
- size = doc.metadata.get('content_size', 0)
461
- stats['total_size'] += size
462
- stats['by_document'][document_id]['count'] += 1
463
- stats['by_document'][document_id]['size'] += size
464
- else:
465
- # Single table - FIXED: Ensure document_id is in table_data
466
- if 'document_id' not in table_data:
467
- table_data['document_id'] = document_id
468
- if 'document' not in table_data:
469
- table_data['document'] = document_id
470
-
471
- docs_list = table_to_document(table_data, document_id=document_id)
472
  table_documents.extend(docs_list)
473
-
474
- for doc in docs_list:
475
- stats['total_tables'] += 1
476
- size = doc.metadata.get('content_size', 0)
477
- stats['total_size'] += size
478
- stats['by_document'][document_id]['count'] += 1
479
- stats['by_document'][document_id]['size'] += size
480
-
481
- except Exception as e:
482
- log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
483
- import traceback
484
- log_message(f"Traceback: {traceback.format_exc()}")
485
- continue
486
-
487
- # Log summary
488
- log_message("\n" + "=" * 60)
489
- log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
490
- log_message("=" * 60)
491
- log_message(f"Всего таблиц: {stats['total_tables']}")
492
- log_message(f"Общий размер: {stats['total_size']:,} символов")
493
- if stats['total_tables'] > 0:
494
- log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
495
-
496
- log_message("\nПо документам:")
497
- for doc_id, doc_stats in sorted(stats['by_document'].items()):
498
- log_message(f" • {doc_id}: {doc_stats['count']} таблиц, {doc_stats['size']:,} символов")
499
-
500
- log_message("=" * 60)
501
-
502
- return table_documents
503
-
504
- except Exception as e:
505
- log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА: {str(e)}")
506
- import traceback
507
- log_message(f"Traceback: {traceback.format_exc()}")
508
- return []
509
 
510
  def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
511
  log_message("Загружаю данные чанков из CSV")
 
392
  return []
393
 
394
def load_table_data(repo_id, hf_token, table_data_dir):
    """Download table JSON files from a HuggingFace dataset repo and convert them to Documents.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: access token passed to the HuggingFace Hub API.
        table_data_dir: path prefix inside the repo under which table ``.json`` files live.

    Returns:
        list: Document objects produced by ``table_to_document`` for every table found.
              Files that fail to download or parse are logged and skipped; an error
              listing the repo itself will propagate to the caller.
    """
    # FIX: the previous revision imported these inside the function; this revision
    # dropped them while still using the names. Restored locally so the function
    # works even if they are not imported at module level (module top not visible here).
    from huggingface_hub import hf_hub_download, list_repo_files
    import json

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]

    table_documents = []

    for file_path in table_files:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                local_dir='',
                repo_type="dataset",
                token=hf_token
            )

            with open(local_path, 'r', encoding='utf-8') as f:
                table_data = json.load(f)

            if isinstance(table_data, dict):
                # Document id may live under several keys depending on the extractor.
                document_id = (
                    table_data.get('document_id') or
                    table_data.get('document') or
                    table_data.get('Обозначение документа') or
                    'unknown'
                )

                # HACK: hard-coded remap of one legacy designation to its GOST id.
                # NOTE(review): presumably a data-cleanup workaround — confirm and
                # move to a configurable mapping if more designations appear.
                if 'НП-104-18' in str(document_id):
                    document_id = 'ГОСТ 59023'

                if 'sheets' in table_data:
                    # Multi-sheet file: stamp each sheet with the resolved id so
                    # downstream metadata is consistent regardless of sheet contents.
                    for sheet in table_data['sheets']:
                        sheet['document_id'] = document_id
                        sheet['document'] = document_id
                        docs_list = table_to_document(sheet, document_id=document_id)
                        table_documents.extend(docs_list)
                else:
                    # Single-table file: stamp the top-level dict the same way.
                    table_data['document_id'] = document_id
                    table_data['document'] = document_id
                    docs_list = table_to_document(table_data, document_id=document_id)
                    table_documents.extend(docs_list)

        except Exception as e:
            # Best-effort per file: log and move on to the next table file.
            log_message(f"Ошибка {file_path}: {str(e)}")
            continue

    return table_documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
 
443
  def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
444
  log_message("Загружаю данные чанков из CSV")
index_retriever.py CHANGED
@@ -13,141 +13,77 @@ def create_vector_index(documents):
13
  return VectorStoreIndex.from_documents(documents)
14
 
15
  def create_query_engine(vector_index):
16
- try:
17
- # FIXED: Significantly increased retrieval for tables and lowered BM25 threshold
18
- bm25_retriever = BM25Retriever.from_defaults(
19
- docstore=vector_index.docstore,
20
- similarity_top_k=80 # Increased from 50
21
- )
22
-
23
- vector_retriever = VectorIndexRetriever(
24
- index=vector_index,
25
- similarity_top_k=80, # Increased from 50
26
- similarity_cutoff=0.45 # FIXED: Lowered from 0.55 to catch more tables
27
- )
28
-
29
- hybrid_retriever = QueryFusionRetriever(
30
- [vector_retriever, bm25_retriever],
31
- similarity_top_k=100, # Increased from 60 to ensure tables aren't filtered early
32
- num_queries=1
33
- )
34
-
35
- custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
36
- response_synthesizer = get_response_synthesizer(
37
- response_mode=ResponseMode.TREE_SUMMARIZE,
38
- text_qa_template=custom_prompt_template
39
- )
40
-
41
- query_engine = RetrieverQueryEngine(
42
- retriever=hybrid_retriever,
43
- response_synthesizer=response_synthesizer
44
- )
45
-
46
- log_message("Query engine успешно создан с улучшенными параметрами поиска таблиц")
47
- return query_engine
48
-
49
- except Exception as e:
50
- log_message(f"Ошибка создания query engine: {str(e)}")
51
- raise
52
 
53
 
54
- def rerank_nodes(query, nodes, reranker, top_k=40, min_score_threshold=0.35, diversity_penalty=0.15): # FIXED: More lenient
55
  if not nodes or not reranker:
56
  return nodes[:top_k]
57
 
58
- try:
59
- log_message(f"Переранжирую {len(nodes)} узлов")
60
-
61
- pairs = [[query, node.text] for node in nodes]
62
- scores = reranker.predict(pairs)
 
 
 
 
 
63
  scored_nodes = list(zip(nodes, scores))
64
-
65
  scored_nodes.sort(key=lambda x: x[1], reverse=True)
 
 
 
 
 
 
 
 
66
 
67
- # FIXED: Much lower threshold and special boost for tables
68
- table_boost = 0.15 # Boost table scores
69
- boosted_scored_nodes = []
70
- for node, score in scored_nodes:
71
- metadata = node.metadata if hasattr(node, 'metadata') else {}
72
- if metadata.get('type') == 'table':
73
- boosted_score = min(1.0, score * (1 + table_boost))
74
- boosted_scored_nodes.append((node, boosted_score))
75
- else:
76
- boosted_scored_nodes.append((node, score))
77
-
78
- boosted_scored_nodes.sort(key=lambda x: x[1], reverse=True)
79
-
80
- if min_score_threshold is not None:
81
- filtered_nodes = [(node, score) for node, score in boosted_scored_nodes
82
- if score >= min_score_threshold]
83
- log_message(f"После фильтрации по порогу {min_score_threshold}: {len(filtered_nodes)} узлов")
84
- if filtered_nodes:
85
- scored_nodes = filtered_nodes
86
- else:
87
- # Fallback: take top nodes even if below threshold
88
- log_message("⚠️ Нет узлов после фильтрации, беру топ-40 без порога")
89
- scored_nodes = boosted_scored_nodes[:40]
90
- else:
91
- scored_nodes = boosted_scored_nodes
92
 
93
- selected_nodes = []
94
- selected_docs = set()
95
- selected_sections = set()
96
- selected_tables = set()
97
- selected_appendix_tables = set() # FIXED: Track appendix tables separately
98
 
99
- for node, score in scored_nodes:
100
- if len(selected_nodes) >= top_k:
101
- break
102
-
103
- metadata = node.metadata if hasattr(node, 'metadata') else {}
104
- doc_id = metadata.get('document_id', 'unknown')
105
- node_type = metadata.get('type', 'text')
106
- section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
107
-
108
- # FIXED: Better table tracking with appendix awareness
109
- if node_type == 'table':
110
- table_num = metadata.get('table_number_clean', metadata.get('table_number', ''))
111
- appendix_num = metadata.get('appendix_number')
112
- if appendix_num:
113
- table_key = f"{doc_id}_appendix_{appendix_num}_table_{table_num}"
114
- else:
115
- table_key = f"{doc_id}_table_{table_num}"
116
- else:
117
- table_key = None
118
-
119
- # FIXED: Even lower diversity penalty for tables
120
- penalty = 0
121
- if node_type == 'table':
122
- # Tables get minimal penalty - we want all relevant tables
123
- if table_key and table_key in selected_tables:
124
- penalty += diversity_penalty * 0.2
125
- else:
126
- penalty += diversity_penalty * 0.05 if doc_id in selected_docs else 0
127
- else:
128
- if doc_id in selected_docs:
129
- penalty += diversity_penalty * 0.5
130
- if section_key in selected_sections:
131
- penalty += diversity_penalty
132
-
133
  adjusted_score = score * (1 - penalty)
134
-
135
- # FIXED: Very lenient threshold for adding nodes
136
- if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.3:
137
- selected_nodes.append((node, score))
138
- selected_docs.add(doc_id)
139
- selected_sections.add(section_key)
140
- if table_key:
141
- selected_tables.add(table_key)
142
-
143
- log_message(f"Выбрано {len(selected_nodes)} узлов с разнообразием")
144
- log_message(f"Уникальных документов: {len(selected_docs)}, секций: {len(selected_sections)}, таблиц: {len(selected_tables)}")
145
-
146
- if selected_nodes:
147
- log_message(f"Score range: {selected_nodes[0][1]:.3f} to {selected_nodes[-1][1]:.3f}")
148
-
149
- return [node for node, score in selected_nodes]
150
 
151
- except Exception as e:
152
- log_message(f"Ошибка переранжировки: {str(e)}")
153
- return nodes[:top_k]
 
 
13
  return VectorStoreIndex.from_documents(documents)
14
 
15
def create_query_engine(vector_index):
    """Build a hybrid (vector + BM25) RetrieverQueryEngine over *vector_index*.

    Both retrievers pull a wide candidate pool (top 80 each, fused to 100) so
    that table chunks are not filtered out too early; answers are synthesized
    with the project's PROMPT_SIMPLE_POISK template in tree-summarize mode.
    """
    # Keyword side: BM25 over the raw docstore.
    keyword_side = BM25Retriever.from_defaults(
        docstore=vector_index.docstore,
        similarity_top_k=80
    )

    # Semantic side: dense retrieval with a similarity cutoff.
    semantic_side = VectorIndexRetriever(
        index=vector_index,
        similarity_top_k=80,
        similarity_cutoff=0.45
    )

    # Fuse both ranked lists into one candidate pool (single query, no rewrites).
    fused_retriever = QueryFusionRetriever(
        [semantic_side, keyword_side],
        similarity_top_k=100,
        num_queries=1
    )

    synthesizer = get_response_synthesizer(
        response_mode=ResponseMode.TREE_SUMMARIZE,
        text_qa_template=PromptTemplate(PROMPT_SIMPLE_POISK)
    )

    return RetrieverQueryEngine(
        retriever=fused_retriever,
        response_synthesizer=synthesizer
    )
 
 
 
 
 
 
45
 
46
 
47
def rerank_nodes(query, nodes, reranker, top_k=40, min_score_threshold=0.35, diversity_penalty=0.15):
    """Rerank retrieved nodes with a cross-encoder, then apply a light diversity filter.

    Args:
        query: the user query string.
        nodes: retrieval nodes; each is expected to expose ``.text`` and
               (optionally) ``.metadata``.
        reranker: object with ``predict(pairs) -> scores`` (e.g. a CrossEncoder).
        top_k: maximum number of nodes to return.
        min_score_threshold: drop nodes scoring below this; pass ``None`` to
               disable filtering entirely.
        diversity_penalty: score penalty applied to repeated document/section
               keys (tables get only 20% of it — duplicates of the same table
               are still useful for answering table questions).

    Returns:
        list: up to *top_k* nodes, best first.
    """
    # Nothing to rank (or no reranker available): pass through unchanged.
    if not nodes or not reranker:
        return nodes[:top_k]

    pairs = [[query, node.text] for node in nodes]
    scores = reranker.predict(pairs)
    scored_nodes = sorted(zip(nodes, scores), key=lambda x: x[1], reverse=True)

    # FIX: use `is not None` — a deliberate threshold of 0/0.0 previously
    # disabled filtering because `if min_score_threshold:` is falsy for zero.
    if min_score_threshold is not None:
        filtered = [(node, score) for node, score in scored_nodes
                    if score >= min_score_threshold]
        # Fallback: if the threshold removed everything, keep the full ranking
        # rather than returning nothing.
        scored_nodes = filtered if filtered else scored_nodes

    scored_nodes = scored_nodes[:top_k]

    selected = []
    seen_keys = {}

    for node, score in scored_nodes:
        if len(selected) >= top_k:
            break

        meta = node.metadata if hasattr(node, 'metadata') else {}
        doc_id = meta.get('document_id', 'unknown')
        node_type = meta.get('type', 'text')

        # Diversity key: per-table for table nodes, per-section otherwise.
        if node_type == 'table':
            key = f"{doc_id}_{meta.get('table_number', '')}"
        else:
            key = f"{doc_id}_{meta.get('section_id', '')}"

        if key in seen_keys:
            # Repeat of an already-selected key: penalize (tables only mildly).
            penalty = diversity_penalty * 0.2 if node_type == 'table' else diversity_penalty
            adjusted_score = score * (1 - penalty)
        else:
            adjusted_score = score
            seen_keys[key] = 1

        # Keep the node unless it has fallen far (below 40% of the best score).
        if not selected or adjusted_score >= selected[0][1] * 0.4:
            selected.append((node, score))

    return [node for node, score in selected]
table_prep.py CHANGED
@@ -4,7 +4,6 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
4
  from my_logging import log_message
5
 
6
  def create_table_content(table_data):
7
- """Create formatted content from table data"""
8
  doc_id = (
9
  table_data.get('document_id') or
10
  table_data.get('document') or
@@ -19,55 +18,34 @@ def create_table_content(table_data):
19
  'Неизвестно'
20
  )
21
 
22
- # FIXED: Normalize table number and create variations
23
- table_num_clean = str(table_num).replace('№', '').replace('№', '').strip()
 
 
 
24
 
25
- # FIXED: Enhanced content with multiple references for better matching
26
- content = f"Документ: {doc_id}\n"
27
- content += f"ГОСТ/Стандарт: {doc_id}\n"
28
- content += f"Таблица номер: {table_num}\n"
29
- content += f"Таблица: {table_num_clean}\n"
30
- content += f"Название таблицы: {table_title}\n"
31
- content += f"Раздел документа: {section}\n"
32
-
33
- # FIXED: Add explicit appendix reference if present
34
- if 'приложени' in section.lower():
35
- appendix_match = section.lower().split('приложени')[1].split()[0] if 'приложени' in section.lower() else ''
36
- content += f"Таблица {table_num_clean} Приложения {appendix_match}\n"
37
 
38
  headers = table_data.get('headers', [])
39
  if headers:
40
- # FIXED: Add headers as searchable keywords
41
- headers_text = ' | '.join(str(h) for h in headers)
42
- content += f"\nЗаголовки колонок: {headers_text}\n"
43
- content += f"Параметры: {headers_text}\n" # Alternative keyword
44
 
45
- # FIXED: Extract and emphasize key data values for better semantic search
46
  if 'data' in table_data and isinstance(table_data['data'], list):
47
- content += "\nСодержимое таблицы:\n"
48
- # Extract unique values for search enhancement
49
- all_values = set()
50
-
51
  for row_idx, row in enumerate(table_data['data'], start=1):
52
  if isinstance(row, dict):
53
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
54
- content += f"Строка {row_idx}: {row_text}\n"
55
- # Collect values
56
- all_values.update([str(v) for v in row.values() if v and str(v).strip()])
57
  elif isinstance(row, list):
58
- row_text = " | ".join([str(v) for v in row if v])
59
- content += f"Строка {row_idx}: {row_text}\n"
60
- all_values.update([str(v) for v in row if v and str(v).strip()])
61
-
62
- # FIXED: Add searchable keywords from data
63
- if all_values:
64
- content += f"\nКлючевые значения: {' '.join(list(all_values)[:50])}\n"
65
 
66
  return content
67
 
68
 
69
  def table_to_document(table_data, document_id=None):
70
- """Convert table data to Document, with smart chunking if needed"""
71
  if not isinstance(table_data, dict):
72
  return []
73
 
@@ -79,72 +57,39 @@ def table_to_document(table_data, document_id=None):
79
  'Неизвестно'
80
  )
81
 
 
 
 
82
  table_num = table_data.get('table_number', 'Неизвестно')
83
- table_num_clean = str(table_num).replace('№', '').replace('№', '').strip()
84
  table_title = table_data.get('table_title', 'Неизвестно')
85
-
86
  section = (
87
  table_data.get('section') or
88
- table_data.get('Раздел документа') or
89
- table_data.get('section_id') or
90
  'Неизвестно'
91
  )
92
 
93
  table_rows = table_data.get('data', [])
94
  if not table_rows:
95
- log_message(f"⚠️ Таблица {table_num} пропущена: нет данных")
96
  return []
97
 
98
  content = create_table_content(table_data)
99
- content_size = len(content)
100
-
101
- # FIXED: Extract appendix info for better identification
102
- appendix_num = None
103
- if 'приложени' in section.lower():
104
- import re
105
- match = re.search(r'приложени[ея]\s*(\d+)', section.lower())
106
- if match:
107
- appendix_num = match.group(1)
108
-
109
- # FIXED: Create comprehensive search variations
110
- search_variations = [
111
- f"{doc_id} таблица {table_num_clean}",
112
- f"{doc_id} {table_num}",
113
- f"таблица {table_num_clean} {doc_id}",
114
- table_title.lower(),
115
- section.lower()
116
- ]
117
-
118
- if appendix_num:
119
- search_variations.extend([
120
- f"таблица {table_num_clean} приложения {appendix_num}",
121
- f"приложение {appendix_num} таблица {table_num_clean}"
122
- ])
123
 
124
  base_doc = Document(
125
  text=content,
126
  metadata={
127
  "type": "table",
128
  "table_number": str(table_num),
129
- "table_number_clean": str(table_num_clean), # FIXED: Add normalized version
130
  "table_title": str(table_title),
131
  "document_id": str(doc_id),
132
  "section": str(section),
133
- "section_id": str(section),
134
- "appendix_number": str(appendix_num) if appendix_num else None, # FIXED: Add appendix tracking
135
  "total_rows": len(table_rows),
136
- "content_size": content_size,
137
- "search_key": " | ".join(search_variations), # FIXED: Enhanced search key
138
- "headers": " ".join(str(h) for h in table_data.get('headers', [])) # FIXED: Add headers as metadata
139
  }
140
  )
141
 
142
- # Apply smart chunking if too large
143
- if content_size > CHUNK_SIZE:
144
- log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
145
  return chunk_table_document(base_doc)
146
  else:
147
- log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов, doc_id={doc_id})")
148
  return [base_doc]
149
 
150
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
@@ -230,4 +175,51 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
230
  )
231
  chunked_docs.append(chunked_doc)
232
 
233
- return chunked_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from my_logging import log_message
5
 
6
  def create_table_content(table_data):
 
7
  doc_id = (
8
  table_data.get('document_id') or
9
  table_data.get('document') or
 
18
  'Неизвестно'
19
  )
20
 
21
+ content = f"ГОСТ {doc_id} Стандарт {doc_id}\n"
22
+ content += f"Документ: {doc_id}\n"
23
+ content += f"Таблица {table_num}\n"
24
+ content += f"Название: {table_title}\n"
25
+ content += f"Раздел: {section}\n"
26
 
27
+ if 'Приложени' in section:
28
+ content += f"Приложение таблица {table_num}\n"
 
 
 
 
 
 
 
 
 
 
29
 
30
  headers = table_data.get('headers', [])
31
  if headers:
32
+ content += f"\nКолонки: {' | '.join(str(h) for h in headers)}\n"
 
 
 
33
 
 
34
  if 'data' in table_data and isinstance(table_data['data'], list):
35
+ content += "\nДанные:\n"
 
 
 
36
  for row_idx, row in enumerate(table_data['data'], start=1):
37
  if isinstance(row, dict):
38
+ for k, v in row.items():
39
+ if v and str(v).strip():
40
+ content += f"{k} {v} "
41
+ content += "\n"
42
  elif isinstance(row, list):
43
+ content += " ".join([str(v) for v in row if v]) + "\n"
 
 
 
 
 
 
44
 
45
  return content
46
 
47
 
48
  def table_to_document(table_data, document_id=None):
 
49
  if not isinstance(table_data, dict):
50
  return []
51
 
 
57
  'Неизвестно'
58
  )
59
 
60
+ if 'НП-104-18' in str(table_data.get('document', '')):
61
+ doc_id = 'ГОСТ 59023'
62
+
63
  table_num = table_data.get('table_number', 'Неизвестно')
 
64
  table_title = table_data.get('table_title', 'Неизвестно')
 
65
  section = (
66
  table_data.get('section') or
67
+ table_data.get('Раздел документа') or
 
68
  'Неизвестно'
69
  )
70
 
71
  table_rows = table_data.get('data', [])
72
  if not table_rows:
 
73
  return []
74
 
75
  content = create_table_content(table_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  base_doc = Document(
78
  text=content,
79
  metadata={
80
  "type": "table",
81
  "table_number": str(table_num),
 
82
  "table_title": str(table_title),
83
  "document_id": str(doc_id),
84
  "section": str(section),
 
 
85
  "total_rows": len(table_rows),
86
+ "content_size": len(content)
 
 
87
  }
88
  )
89
 
90
+ if len(content) > CHUNK_SIZE:
 
 
91
  return chunk_table_document(base_doc)
92
  else:
 
93
  return [base_doc]
94
 
95
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
 
175
  )
176
  chunked_docs.append(chunked_doc)
177
 
178
+ return chunked_docs
179
+
180
def table_to_document(table_data, document_id=None):
    """Convert one parsed table dict into Document objects (chunked when oversized).

    NOTE(review): this definition also appears earlier in the new table_prep.py;
    being last, it shadows the earlier one — confirm the duplication is intentional.
    """
    # Guard: only dict-shaped table payloads are accepted.
    if not isinstance(table_data, dict):
        return []

    # Guard: tables without data rows produce no documents.
    rows = table_data.get('data', [])
    if not rows:
        return []

    # Resolve the owning document id, preferring the explicit argument.
    resolved_id = (
        document_id or
        table_data.get('document_id') or
        table_data.get('document') or
        table_data.get('Обозначение документа') or
        'Неизвестно'
    )

    # Hard-coded remap of the legacy НП-104-18 designation (checks only the
    # 'document' field, mirroring the original behavior).
    if 'НП-104-18' in str(table_data.get('document', '')):
        resolved_id = 'ГОСТ 59023'

    number = table_data.get('table_number', 'Неизвестно')
    title = table_data.get('table_title', 'Неизвестно')
    section_name = (
        table_data.get('section') or
        table_data.get('Раздел документа') or
        'Неизвестно'
    )

    body = create_table_content(table_data)

    table_doc = Document(
        text=body,
        metadata={
            "type": "table",
            "table_number": str(number),
            "table_title": str(title),
            "document_id": str(resolved_id),
            "section": str(section_name),
            "total_rows": len(rows),
            "content_size": len(body)
        }
    )

    # Oversized tables are split by the shared chunker; small ones pass whole.
    return chunk_table_document(table_doc) if len(body) > CHUNK_SIZE else [table_doc]