MrSimple07 committed on
Commit
0067c9d
·
1 Parent(s): 7dcc6c5

Removed all custom table configurations (CUSTOM_TABLE_CONFIGS)

Browse files
Files changed (2) hide show
  1. documents_prep.py +18 -38
  2. table_prep.py +81 -179
documents_prep.py CHANGED
@@ -44,7 +44,6 @@ def process_documents_with_chunking(documents):
44
  all_chunked_docs = []
45
  chunk_info = []
46
  table_count = 0
47
- table_added_count = 0
48
  image_count = 0
49
  text_chunks_count = 0
50
 
@@ -52,44 +51,19 @@ def process_documents_with_chunking(documents):
52
  doc_type = doc.metadata.get('type', 'text')
53
 
54
  if doc_type == 'table':
 
55
  table_count += 1
56
- doc_size = len(doc.text)
57
-
58
- # Log table size
59
- table_num = doc.metadata.get('table_number', 'unknown')
60
- doc_id = doc.metadata.get('document_id', 'unknown')
61
- log_message(f"Таблица {table_num} в документе {doc_id}: размер {doc_size} символов")
62
 
63
- # Always add table regardless of size or custom config
64
- if doc_size > CHUNK_SIZE:
65
- chunked_docs = chunk_document(doc)
66
- all_chunked_docs.extend(chunked_docs)
67
- table_added_count += len(chunked_docs)
68
-
69
- for i, chunk_doc in enumerate(chunked_docs):
70
- chunk_info.append({
71
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
72
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
73
- 'chunk_id': i,
74
- 'chunk_size': len(chunk_doc.text),
75
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
76
- 'type': 'table',
77
- 'table_number': chunk_doc.metadata.get('table_number', 'unknown'),
78
- 'processing_method': 'chunked'
79
- })
80
- else:
81
- all_chunked_docs.append(doc)
82
- table_added_count += 1
83
- chunk_info.append({
84
- 'document_id': doc.metadata.get('document_id', 'unknown'),
85
- 'section_id': doc.metadata.get('section_id', 'unknown'),
86
- 'chunk_id': 0,
87
- 'chunk_size': doc_size,
88
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
89
- 'type': 'table',
90
- 'table_number': doc.metadata.get('table_number', 'unknown'),
91
- 'processing_method': 'standard'
92
- })
93
 
94
  elif doc_type == 'image':
95
  image_count += 1
@@ -147,7 +121,13 @@ def process_documents_with_chunking(documents):
147
  'type': 'text'
148
  })
149
 
150
- log_message(f"Таблицы: всего {table_count}, добавлено {table_added_count}, Изображения: {image_count}, Текстовые чанки: {text_chunks_count}, Итого: {len(all_chunked_docs)}")
 
 
 
 
 
 
151
 
152
  return all_chunked_docs, chunk_info
153
 
 
44
  all_chunked_docs = []
45
  chunk_info = []
46
  table_count = 0
 
47
  image_count = 0
48
  text_chunks_count = 0
49
 
 
51
  doc_type = doc.metadata.get('type', 'text')
52
 
53
  if doc_type == 'table':
54
+ # Add tables as-is, no chunking
55
  table_count += 1
56
+ all_chunked_docs.append(doc)
 
 
 
 
 
57
 
58
+ chunk_info.append({
59
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
60
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
61
+ 'chunk_id': 0,
62
+ 'chunk_size': len(doc.text),
63
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
64
+ 'type': 'table',
65
+ 'table_number': doc.metadata.get('table_number', 'unknown')
66
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  elif doc_type == 'image':
69
  image_count += 1
 
121
  'type': 'text'
122
  })
123
 
124
+ log_message(f"\n{'='*60}")
125
+ log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
126
+ log_message(f" • Таблицы: {table_count} (добавлены целиком)")
127
+ log_message(f" • Изображения: {image_count}")
128
+ log_message(f" • Текстовые чанки: {text_chunks_count}")
129
+ log_message(f" • Всего документов: {len(all_chunked_docs)}")
130
+ log_message(f"{'='*60}\n")
131
 
132
  return all_chunked_docs, chunk_info
133
 
table_prep.py CHANGED
@@ -4,156 +4,33 @@ from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
- CUSTOM_TABLE_CONFIGS = {
8
- "ГОСТ Р 50.05.01-2018": {
9
- "№3": {"method": "group_by_column", "group_column": "Класс герметичности и чувствительности"},
10
- "№Б.1": {"method": "group_by_column", "group_column": "Класс чувствительности системы контроля"}
11
- },
12
- "ГОСТ Р 50.06.01-2017": {"№ Б.2": {"method": "split_by_rows"}},
13
- "НП-104-18": {"*": {"method": "group_entire_table"}},
14
- "НП-068-05": {
15
- "Таблица 1": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
16
- "Таблица 2": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
17
- "Таблица Приложения 1": {"method": "group_by_column", "group_column": "Тип"}
18
- },
19
- "ГОСТ Р 59023.1-2020": {
20
- "№ 1": {"method": "split_by_rows"},
21
- "№ 2": {"method": "split_by_rows"},
22
- "№ 3": {"method": "split_by_rows"}
23
- },
24
- "НП-089-15": {"-": {"method": "split_by_rows"}},
25
- "НП-105-18": {"№ 4.8": {"method": "group_entire_table"}},
26
- "ГОСТ Р 50.05.23-2020": {"№8": {"method": "group_entire_table"}},
27
- "ГОСТ Р 50.03.01-2017": {"А.8": {"method": "group_entire_table"}}
28
- }
29
-
30
- def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
31
- base_info = f'Документ "{document_name}", Раздел: {section}, Таблица: {table_number}'
32
- if table_title and table_title.strip():
33
- base_info += f', Название: {table_title}'
34
- if extra_info:
35
- base_info += f', {extra_info}'
36
- return base_info
37
-
38
- def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
39
- chunk_lines = [meta_info.rstrip()]
40
- chunk_lines.append("Заголовки: " + " | ".join(headers))
41
-
42
- for i, row in enumerate(rows, start=1):
43
- row_parts = [f"{h}: {row.get(h, '')}" for h in headers if row.get(h, '')]
44
- if add_row_numbers:
45
- chunk_lines.append(f"Строка {i}: {' | '.join(row_parts)}")
46
- else:
47
- chunk_lines.append(' | '.join(row_parts))
48
-
49
- return "\n".join(chunk_lines)
50
-
51
- def get_custom_config(document_id, table_number):
52
- for doc_pattern, tables_config in CUSTOM_TABLE_CONFIGS.items():
53
- if document_id.startswith(doc_pattern):
54
- return tables_config.get(table_number, tables_config.get("*"))
55
- return None
56
-
57
- def group_by_column_method(table_data, document_name, group_column):
58
- documents = []
59
- headers = table_data.get("headers", [])
60
- rows = table_data.get("data", [])
61
- section = table_data.get("section", "")
62
- table_number = table_data.get("table_number", "")
63
- table_title = table_data.get("table_title", "")
64
-
65
- grouped = defaultdict(list)
66
- for row in rows:
67
- grouped[row.get(group_column, "UNKNOWN")].append(row)
68
-
69
- for group_value, group_rows in grouped.items():
70
- meta_info = create_meta_info(document_name, section, table_number, table_title,
71
- f'Группа по "{group_column}": {group_value}')
72
- chunk_text = create_chunk_text(meta_info, headers, group_rows, add_row_numbers=True)
73
-
74
- documents.append(Document(
75
- text=chunk_text,
76
- metadata={
77
- "type": "table",
78
- "table_number": table_number,
79
- "table_title": table_title,
80
- "document_id": document_name,
81
- "section": section,
82
- "section_id": section,
83
- "group_column": group_column,
84
- "group_value": group_value,
85
- "total_rows": len(group_rows),
86
- "processing_method": "group_by_column"
87
- }
88
- ))
89
-
90
- return documents
91
-
92
- def split_by_rows_method(table_data, document_name):
93
- documents = []
94
- headers = table_data.get("headers", [])
95
- rows = table_data.get("data", [])
96
- section = table_data.get("section", "")
97
- table_number = table_data.get("table_number", "")
98
- table_title = table_data.get("table_title", "")
99
-
100
- for i, row in enumerate(rows, start=1):
101
- meta_info = create_meta_info(document_name, section, table_number, table_title, f'Строка: {i}')
102
- chunk_text = create_chunk_text(meta_info, headers, [row])
103
-
104
- documents.append(Document(
105
- text=chunk_text,
106
- metadata={
107
- "type": "table",
108
- "table_number": table_number,
109
- "table_title": table_title,
110
- "document_id": document_name,
111
- "section": section,
112
- "section_id": section,
113
- "row_number": i,
114
- "total_rows": len(rows),
115
- "processing_method": "split_by_rows"
116
- }
117
- ))
118
 
119
- return documents
120
-
121
- def group_entire_table_method(table_data, document_name):
122
- headers = table_data.get("headers", [])
123
- rows = table_data.get("data", [])
124
- section = table_data.get("section", "")
125
- table_number = table_data.get("table_number", "")
126
- table_title = table_data.get("table_title", "")
127
 
128
- meta_info = create_meta_info(document_name, section, table_number, table_title)
129
- chunk_text = create_chunk_text(meta_info, headers, rows)
 
130
 
131
- return [Document(
132
- text=chunk_text,
133
- metadata={
134
- "type": "table",
135
- "table_number": table_number,
136
- "table_title": table_title,
137
- "document_id": document_name,
138
- "section": section,
139
- "section_id": section,
140
- "total_rows": len(rows),
141
- "processing_method": "group_entire_table"
142
- }
143
- )]
144
-
145
- def process_table(table_data, document_name, method_config):
146
- method = method_config.get("method")
147
 
148
- if method == "group_by_column":
149
- return group_by_column_method(table_data, document_name, method_config.get("group_column"))
150
- elif method == "split_by_rows":
151
- return split_by_rows_method(table_data, document_name)
152
- elif method == "group_entire_table":
153
- return group_entire_table_method(table_data, document_name)
154
- return None
155
 
156
  def table_to_document(table_data, document_id=None):
 
157
  if not isinstance(table_data, dict):
158
  return []
159
 
@@ -162,39 +39,16 @@ def table_to_document(table_data, document_id=None):
162
  table_title = table_data.get('table_title', 'Неизвестно')
163
  section = table_data.get('section', 'Неизвестно')
164
 
165
- method_config = get_custom_config(doc_id, table_num)
 
166
 
167
- if method_config:
168
- log_message(f"✓ Таблица {table_num} '{table_title}' в документе {doc_id}: метод {method_config['method']}")
169
- custom_docs = process_table(table_data, doc_id, method_config)
170
- if custom_docs:
171
- return custom_docs
172
-
173
- header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"
174
-
175
- if 'data' in table_data and isinstance(table_data['data'], list):
176
- table_content = header_content + "\nДанные таблицы:\n"
177
- for row_idx, row in enumerate(table_data['data']):
178
- if isinstance(row, dict):
179
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
180
- table_content += f"Строка {row_idx + 1}: {row_text}\n"
181
-
182
- return [Document(
183
- text=table_content,
184
- metadata={
185
- "type": "table",
186
- "table_number": table_num,
187
- "table_title": table_title,
188
- "document_id": doc_id,
189
- "section": section,
190
- "section_id": section,
191
- "total_rows": len(table_data['data']),
192
- "processing_method": "default"
193
- }
194
- )]
195
 
196
  return [Document(
197
- text=header_content,
198
  metadata={
199
  "type": "table",
200
  "table_number": table_num,
@@ -202,12 +56,15 @@ def table_to_document(table_data, document_id=None):
202
  "document_id": doc_id,
203
  "section": section,
204
  "section_id": section,
205
- "processing_method": "default"
 
206
  }
207
  )]
208
 
209
  def load_table_data(repo_id, hf_token, table_data_dir):
210
- log_message("Загрузка табличных данных")
 
 
211
 
212
  try:
213
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -216,6 +73,12 @@ def load_table_data(repo_id, hf_token, table_data_dir):
216
  log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
217
 
218
  table_documents = []
 
 
 
 
 
 
219
  for file_path in table_files:
220
  try:
221
  local_path = hf_hub_download(
@@ -226,6 +89,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
226
  token=hf_token
227
  )
228
 
 
 
229
  with open(local_path, 'r', encoding='utf-8') as f:
230
  table_data = json.load(f)
231
 
@@ -237,21 +102,58 @@ def load_table_data(repo_id, hf_token, table_data_dir):
237
  sheet['document'] = document_id
238
  docs_list = table_to_document(sheet, document_id)
239
  table_documents.extend(docs_list)
 
 
 
 
 
 
 
240
  else:
241
  docs_list = table_to_document(table_data, document_id)
242
  table_documents.extend(docs_list)
 
 
 
 
 
 
 
 
243
  elif isinstance(table_data, list):
244
  for table_json in table_data:
245
  docs_list = table_to_document(table_json)
246
  table_documents.extend(docs_list)
 
 
 
 
 
 
 
 
247
 
248
  except Exception as e:
249
- log_message(f"Ошибка файла {file_path}: {str(e)}")
250
  continue
251
 
252
- log_message(f"✓✓✓ ИТОГО создано {len(table_documents)} табличных документов (до chunking)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  return table_documents
254
 
255
  except Exception as e:
256
- log_message(f"Ошибка загрузки табличных данных: {str(e)}")
257
  return []
 
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
+ def create_table_content(table_data):
8
+ """Create formatted content from table data"""
9
+ doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
10
+ table_num = table_data.get('table_number', 'Неизвестно')
11
+ table_title = table_data.get('table_title', 'Неизвестно')
12
+ section = table_data.get('section', 'Неизвестно')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ content = f"Таблица: {table_num}\n"
15
+ content += f"Название: {table_title}\n"
16
+ content += f"Документ: {doc_id}\n"
17
+ content += f"Раздел: {section}\n"
 
 
 
 
18
 
19
+ headers = table_data.get('headers', [])
20
+ if headers:
21
+ content += f"\nЗаголовки: {' | '.join(headers)}\n"
22
 
23
+ if 'data' in table_data and isinstance(table_data['data'], list):
24
+ content += "\nДанные таблицы:\n"
25
+ for row_idx, row in enumerate(table_data['data'], start=1):
26
+ if isinstance(row, dict):
27
+ row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
28
+ content += f"Строка {row_idx}: {row_text}\n"
 
 
 
 
 
 
 
 
 
 
29
 
30
+ return content
 
 
 
 
 
 
31
 
32
  def table_to_document(table_data, document_id=None):
33
+ """Convert table data to a single Document"""
34
  if not isinstance(table_data, dict):
35
  return []
36
 
 
39
  table_title = table_data.get('table_title', 'Неизвестно')
40
  section = table_data.get('section', 'Неизвестно')
41
 
42
+ content = create_table_content(table_data)
43
+ content_size = len(content)
44
 
45
+ # Log table addition
46
+ row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
47
+ log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
48
+ f"Размер: {content_size} символов | Строк: {row_count}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  return [Document(
51
+ text=content,
52
  metadata={
53
  "type": "table",
54
  "table_number": table_num,
 
56
  "document_id": doc_id,
57
  "section": section,
58
  "section_id": section,
59
+ "total_rows": row_count,
60
+ "content_size": content_size
61
  }
62
  )]
63
 
64
  def load_table_data(repo_id, hf_token, table_data_dir):
65
+ log_message("=" * 60)
66
+ log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
67
+ log_message("=" * 60)
68
 
69
  try:
70
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
 
73
  log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
74
 
75
  table_documents = []
76
+ stats = {
77
+ 'total_tables': 0,
78
+ 'total_size': 0,
79
+ 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
80
+ }
81
+
82
  for file_path in table_files:
83
  try:
84
  local_path = hf_hub_download(
 
89
  token=hf_token
90
  )
91
 
92
+ log_message(f"\nОбработка файла: {file_path}")
93
+
94
  with open(local_path, 'r', encoding='utf-8') as f:
95
  table_data = json.load(f)
96
 
 
102
  sheet['document'] = document_id
103
  docs_list = table_to_document(sheet, document_id)
104
  table_documents.extend(docs_list)
105
+
106
+ for doc in docs_list:
107
+ stats['total_tables'] += 1
108
+ size = doc.metadata.get('content_size', 0)
109
+ stats['total_size'] += size
110
+ stats['by_document'][document_id]['count'] += 1
111
+ stats['by_document'][document_id]['size'] += size
112
  else:
113
  docs_list = table_to_document(table_data, document_id)
114
  table_documents.extend(docs_list)
115
+
116
+ for doc in docs_list:
117
+ stats['total_tables'] += 1
118
+ size = doc.metadata.get('content_size', 0)
119
+ stats['total_size'] += size
120
+ stats['by_document'][document_id]['count'] += 1
121
+ stats['by_document'][document_id]['size'] += size
122
+
123
  elif isinstance(table_data, list):
124
  for table_json in table_data:
125
  docs_list = table_to_document(table_json)
126
  table_documents.extend(docs_list)
127
+
128
+ for doc in docs_list:
129
+ doc_id = doc.metadata.get('document_id', 'unknown')
130
+ stats['total_tables'] += 1
131
+ size = doc.metadata.get('content_size', 0)
132
+ stats['total_size'] += size
133
+ stats['by_document'][doc_id]['count'] += 1
134
+ stats['by_document'][doc_id]['size'] += size
135
 
136
  except Exception as e:
137
+ log_message(f" ОШИБКА файла {file_path}: {str(e)}")
138
  continue
139
 
140
+ # Log summary statistics
141
+ log_message("\n" + "=" * 60)
142
+ log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
143
+ log_message("=" * 60)
144
+ log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
145
+ log_message(f"Общий размер: {stats['total_size']:,} символов")
146
+ log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
147
+
148
+ log_message("\nПо документам:")
149
+ for doc_id, doc_stats in sorted(stats['by_document'].items()):
150
+ log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
151
+ f"{doc_stats['size']:,} символов")
152
+
153
+ log_message("=" * 60)
154
+
155
  return table_documents
156
 
157
  except Exception as e:
158
+ log_message(f" КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
159
  return []