MrSimple07 committed on
Commit
c81fd8c
·
1 Parent(s): 5c2023e

Removed duplicate logs throughout all files

Browse files
Files changed (3) hide show
  1. documents_prep.py +5 -25
  2. table_prep.py +86 -176
  3. utils.py +16 -34
documents_prep.py CHANGED
@@ -46,8 +46,6 @@ def process_documents_with_chunking(documents):
46
  table_count = 0
47
  image_count = 0
48
  text_chunks_count = 0
49
- large_tables_count = 0
50
- large_images_count = 0
51
  custom_processed_count = 0
52
 
53
  for doc in documents:
@@ -57,13 +55,11 @@ def process_documents_with_chunking(documents):
57
  table_count += 1
58
  doc_id = doc.metadata.get('document_id', 'unknown')
59
  table_num = doc.metadata.get('table_number', 'unknown')
60
- from table_prep import should_use_custom_processing
61
- use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
62
 
63
- if use_custom:
64
  custom_processed_count += 1
65
- log_message(f"Table {table_num} in document {doc_id} was processed with custom method '{method_config.get('method')}', skipping standard chunking")
66
- # Add the document as-is since it was already processed by custom method
67
  all_chunked_docs.append(doc)
68
  chunk_info.append({
69
  'document_id': doc_id,
@@ -77,13 +73,8 @@ def process_documents_with_chunking(documents):
77
  })
78
  continue
79
 
80
- # Standard processing for non-custom tables
81
  doc_size = len(doc.text)
82
  if doc_size > CHUNK_SIZE:
83
- large_tables_count += 1
84
- log_message(f"Large table found: {table_num} in document {doc_id}, size: {doc_size} characters")
85
-
86
- # Chunk large tables
87
  chunked_docs = chunk_document(doc)
88
  all_chunked_docs.extend(chunked_docs)
89
 
@@ -115,10 +106,6 @@ def process_documents_with_chunking(documents):
115
  image_count += 1
116
  doc_size = len(doc.text)
117
  if doc_size > CHUNK_SIZE:
118
- large_images_count += 1
119
- log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
120
-
121
- # Chunk large images
122
  chunked_docs = chunk_document(doc)
123
  all_chunked_docs.extend(chunked_docs)
124
 
@@ -144,7 +131,7 @@ def process_documents_with_chunking(documents):
144
  'image_number': doc.metadata.get('image_number', 'unknown')
145
  })
146
 
147
- else: # text documents
148
  doc_size = len(doc.text)
149
  if doc_size > CHUNK_SIZE:
150
  chunked_docs = chunk_document(doc)
@@ -171,14 +158,7 @@ def process_documents_with_chunking(documents):
171
  'type': 'text'
172
  })
173
 
174
- log_message(f"=== PROCESSING STATISTICS ===")
175
- log_message(f"Total tables processed: {table_count}")
176
- log_message(f"Custom processed tables: {custom_processed_count}")
177
- log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
178
- log_message(f"Total images processed: {image_count}")
179
- log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
180
- log_message(f"Total text chunks created: {text_chunks_count}")
181
- log_message(f"Total documents after processing: {len(all_chunked_docs)}")
182
 
183
  return all_chunked_docs, chunk_info
184
 
 
46
  table_count = 0
47
  image_count = 0
48
  text_chunks_count = 0
 
 
49
  custom_processed_count = 0
50
 
51
  for doc in documents:
 
55
  table_count += 1
56
  doc_id = doc.metadata.get('document_id', 'unknown')
57
  table_num = doc.metadata.get('table_number', 'unknown')
58
+ from table_prep import get_custom_config
59
+ method_config = get_custom_config(doc_id, table_num)
60
 
61
+ if method_config:
62
  custom_processed_count += 1
 
 
63
  all_chunked_docs.append(doc)
64
  chunk_info.append({
65
  'document_id': doc_id,
 
73
  })
74
  continue
75
 
 
76
  doc_size = len(doc.text)
77
  if doc_size > CHUNK_SIZE:
 
 
 
 
78
  chunked_docs = chunk_document(doc)
79
  all_chunked_docs.extend(chunked_docs)
80
 
 
106
  image_count += 1
107
  doc_size = len(doc.text)
108
  if doc_size > CHUNK_SIZE:
 
 
 
 
109
  chunked_docs = chunk_document(doc)
110
  all_chunked_docs.extend(chunked_docs)
111
 
 
131
  'image_number': doc.metadata.get('image_number', 'unknown')
132
  })
133
 
134
+ else:
135
  doc_size = len(doc.text)
136
  if doc_size > CHUNK_SIZE:
137
  chunked_docs = chunk_document(doc)
 
158
  'type': 'text'
159
  })
160
 
161
+ log_message(f"Таблицы: {table_count} (кастомных: {custom_processed_count}), Изображения: {image_count}, Текстовые чанки: {text_chunks_count}, Итого: {len(all_chunked_docs)}")
 
 
 
 
 
 
 
162
 
163
  return all_chunked_docs, chunk_info
164
 
table_prep.py CHANGED
@@ -1,63 +1,30 @@
1
- import os
2
  from collections import defaultdict
3
  import json
4
- import zipfile
5
- import pandas as pd
6
  from huggingface_hub import hf_hub_download, list_repo_files
7
  from llama_index.core import Document
8
  from my_logging import log_message
9
 
10
  CUSTOM_TABLE_CONFIGS = {
11
  "ГОСТ Р 50.05.01-2018": {
12
- "tables": {
13
- "№3": {"method": "group_by_column", "group_column": "Класс герметичности и чувствительности"},
14
- "№Б.1": {"method": "group_by_column", "group_column": "Класс чувствительности системы контроля"}
15
- }
16
- },
17
- "ГОСТ Р 50.06.01-2017": {
18
- "tables": {
19
- "№ Б.2": {"method": "split_by_rows"}
20
- }
21
- },
22
- "НП-104-18": {
23
- "tables": {
24
- "*": {"method": "group_entire_table"} # All tables
25
- }
26
  },
 
 
27
  "НП-068-05": {
28
- "tables": {
29
- "Таблица 1": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
30
- "Таблица 2": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
31
- "Таблица Приложения 1": {"method": "group_by_column", "group_column": "Тип"}
32
- }
33
  },
34
  "ГОСТ Р 59023.1-2020": {
35
- "tables": {
36
- "№ 1": {"method": "split_by_rows"},
37
- "№ 2": {"method": "split_by_rows"},
38
- "№ 3": {"method": "split_by_rows"}
39
- }
40
- },
41
- "НП-089-15": {
42
- "tables": {
43
- "-": {"method": "split_by_rows"}
44
- }
45
  },
46
- "НП-105-18": {
47
- "tables": {
48
- " 4.8": {"method": "group_entire_table"}
49
- }
50
- },
51
- "ГОСТ Р 50.05.23-2020": {
52
- "tables": {
53
- "№8": {"method": "group_entire_table"}
54
- }
55
- },
56
- "ГОСТ Р 50.03.01-2017": {
57
- "tables": {
58
- "А.8": {"method": "group_entire_table"}
59
- }
60
- }
61
  }
62
 
63
  def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
@@ -69,28 +36,25 @@ def create_meta_info(document_name, section, table_number, table_title, extra_in
69
  return base_info
70
 
71
  def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
72
- chunk_lines = [meta_info.rstrip()] # Remove trailing newline from meta_info
73
-
74
- # Add headers only once
75
- header_line = " | ".join(headers)
76
- chunk_lines.append(f"Заголовки: {header_line}")
77
 
78
- # Add rows without redundant formatting
79
  for i, row in enumerate(rows, start=1):
80
- row_parts = []
81
- for h in headers:
82
- value = row.get(h, '')
83
- if value: # Only add non-empty values
84
- row_parts.append(f"{h}: {value}")
85
-
86
  if add_row_numbers:
87
  chunk_lines.append(f"Строка {i}: {' | '.join(row_parts)}")
88
  else:
89
  chunk_lines.append(' | '.join(row_parts))
90
 
91
  return "\n".join(chunk_lines)
 
 
 
 
 
 
 
92
  def group_by_column_method(table_data, document_name, group_column):
93
- """Group rows by specified column value"""
94
  documents = []
95
  headers = table_data.get("headers", [])
96
  rows = table_data.get("data", [])
@@ -100,16 +64,14 @@ def group_by_column_method(table_data, document_name, group_column):
100
 
101
  grouped = defaultdict(list)
102
  for row in rows:
103
- key = row.get(group_column, "UNKNOWN")
104
- grouped[key].append(row)
105
 
106
  for group_value, group_rows in grouped.items():
107
  meta_info = create_meta_info(document_name, section, table_number, table_title,
108
  f'Группа по "{group_column}": {group_value}')
109
-
110
  chunk_text = create_chunk_text(meta_info, headers, group_rows, add_row_numbers=True)
111
 
112
- doc = Document(
113
  text=chunk_text,
114
  metadata={
115
  "type": "table",
@@ -123,14 +85,11 @@ def group_by_column_method(table_data, document_name, group_column):
123
  "total_rows": len(group_rows),
124
  "processing_method": "group_by_column"
125
  }
126
- )
127
- documents.append(doc)
128
- log_message(f"Created grouped chunk for {group_column}={group_value}, rows: {len(group_rows)}, length: {len(chunk_text)}")
129
 
130
  return documents
131
 
132
  def split_by_rows_method(table_data, document_name):
133
- """Split table into individual row chunks"""
134
  documents = []
135
  headers = table_data.get("headers", [])
136
  rows = table_data.get("data", [])
@@ -140,10 +99,9 @@ def split_by_rows_method(table_data, document_name):
140
 
141
  for i, row in enumerate(rows, start=1):
142
  meta_info = create_meta_info(document_name, section, table_number, table_title, f'Строка: {i}')
143
-
144
  chunk_text = create_chunk_text(meta_info, headers, [row])
145
 
146
- doc = Document(
147
  text=chunk_text,
148
  metadata={
149
  "type": "table",
@@ -156,14 +114,11 @@ def split_by_rows_method(table_data, document_name):
156
  "total_rows": len(rows),
157
  "processing_method": "split_by_rows"
158
  }
159
- )
160
- documents.append(doc)
161
 
162
- log_message(f"Split table {table_number} into {len(rows)} row chunks")
163
  return documents
164
 
165
  def group_entire_table_method(table_data, document_name):
166
- """Group entire table as one chunk"""
167
  headers = table_data.get("headers", [])
168
  rows = table_data.get("data", [])
169
  section = table_data.get("section", "")
@@ -173,7 +128,7 @@ def group_entire_table_method(table_data, document_name):
173
  meta_info = create_meta_info(document_name, section, table_number, table_title)
174
  chunk_text = create_chunk_text(meta_info, headers, rows)
175
 
176
- doc = Document(
177
  text=chunk_text,
178
  metadata={
179
  "type": "table",
@@ -185,108 +140,84 @@ def group_entire_table_method(table_data, document_name):
185
  "total_rows": len(rows),
186
  "processing_method": "group_entire_table"
187
  }
188
- )
189
-
190
- log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
191
- return [doc]
192
 
193
- def should_use_custom_processing(document_id, table_number):
194
- """Check if table should use custom processing"""
195
- for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
196
- if document_id.startswith(doc_pattern):
197
- tables_config = config.get("tables", {})
198
- if table_number in tables_config or "*" in tables_config:
199
- return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
200
- return False, None, None
201
-
202
- def process_table_with_custom_method(table_data, document_name, method_config):
203
- """Process table using custom method"""
204
  method = method_config.get("method")
205
 
206
  if method == "group_by_column":
207
- group_column = method_config.get("group_column")
208
- return group_by_column_method(table_data, document_name, group_column)
209
  elif method == "split_by_rows":
210
  return split_by_rows_method(table_data, document_name)
211
  elif method == "group_entire_table":
212
  return group_entire_table_method(table_data, document_name)
213
- else:
214
- log_message(f"Unknown custom method: {method}, falling back to default processing")
215
- return None
216
 
217
  def table_to_document(table_data, document_id=None):
218
- if isinstance(table_data, dict):
219
- doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
220
- table_num = table_data.get('table_number', 'Неизвестно')
221
- use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
222
-
223
- if use_custom:
224
- log_message(f"Using custom processing for table {table_num} in document {doc_id}")
225
- custom_docs = process_table_with_custom_method(table_data, doc_id, method_config)
226
- if custom_docs:
227
- return custom_docs
228
-
229
- # DEFAULT PROCESSING (only if NOT using custom)
230
- table_title = table_data.get('table_title', 'Неизвестно')
231
- section = table_data.get('section', 'Неизвестно')
232
-
233
- header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"
 
 
 
 
 
 
 
 
234
 
235
- if 'data' in table_data and isinstance(table_data['data'], list):
236
- table_content = header_content + "\nДанные таблицы:\n"
237
- for row_idx, row in enumerate(table_data['data']):
238
- if isinstance(row, dict):
239
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
240
- table_content += f"Строка {row_idx + 1}: {row_text}\n"
241
-
242
- doc = Document(
243
- text=table_content,
244
- metadata={
245
- "type": "table",
246
- "table_number": table_num,
247
- "table_title": table_title,
248
- "document_id": doc_id,
249
- "section": section,
250
- "section_id": section,
251
- "total_rows": len(table_data['data']),
252
- "processing_method": "default"
253
- }
254
- )
255
- return [doc]
256
- else:
257
- doc = Document(
258
- text=header_content,
259
- metadata={
260
- "type": "table",
261
- "table_number": table_num,
262
- "table_title": table_title,
263
- "document_id": doc_id,
264
- "section": section,
265
- "section_id": section,
266
- "processing_method": "default"
267
- }
268
- )
269
- return [doc]
270
 
271
- return []
 
 
 
 
 
 
 
 
 
 
 
272
 
273
  def load_table_data(repo_id, hf_token, table_data_dir):
274
- """Modified function with custom table processing integration"""
275
- log_message("Начинаю загрузку табличных данных")
276
 
277
- table_files = []
278
  try:
279
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
280
- for file in files:
281
- if file.startswith(table_data_dir) and file.endswith('.json'):
282
- table_files.append(file)
283
 
284
  log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
285
 
286
  table_documents = []
287
  for file_path in table_files:
288
  try:
289
- log_message(f"Обрабатываю файл: {file_path}")
290
  local_path = hf_hub_download(
291
  repo_id=repo_id,
292
  filename=file_path,
@@ -304,39 +235,18 @@ def load_table_data(repo_id, hf_token, table_data_dir):
304
  if 'sheets' in table_data:
305
  for sheet in table_data['sheets']:
306
  sheet['document'] = document_id
307
- # Check if this table uses custom processing
308
- table_num = sheet.get('table_number', 'Неизвестно')
309
- use_custom, _, _ = should_use_custom_processing(document_id, table_num)
310
-
311
- if use_custom:
312
- log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
313
-
314
  docs_list = table_to_document(sheet, document_id)
315
  table_documents.extend(docs_list)
316
  else:
317
- # Check if this table uses custom processing
318
- table_num = table_data.get('table_number', 'Неизвестно')
319
- use_custom, _, _ = should_use_custom_processing(document_id, table_num)
320
-
321
- if use_custom:
322
- log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
323
-
324
  docs_list = table_to_document(table_data, document_id)
325
  table_documents.extend(docs_list)
326
  elif isinstance(table_data, list):
327
  for table_json in table_data:
328
- document_id = table_json.get('document', 'unknown')
329
- table_num = table_json.get('table_number', 'Неизвестно')
330
- use_custom, _, _ = should_use_custom_processing(document_id, table_num)
331
-
332
- if use_custom:
333
- log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
334
-
335
  docs_list = table_to_document(table_json)
336
  table_documents.extend(docs_list)
337
 
338
  except Exception as e:
339
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
340
  continue
341
 
342
  log_message(f"Создано {len(table_documents)} документов из таблиц")
 
 
1
  from collections import defaultdict
2
  import json
 
 
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
  CUSTOM_TABLE_CONFIGS = {
8
  "ГОСТ Р 50.05.01-2018": {
9
+ "№3": {"method": "group_by_column", "group_column": "Класс герметичности и чувствительности"},
10
+ "№Б.1": {"method": "group_by_column", "group_column": "Класс чувствительности системы контроля"}
 
 
 
 
 
 
 
 
 
 
 
 
11
  },
12
+ "ГОСТ Р 50.06.01-2017": {"№ Б.2": {"method": "split_by_rows"}},
13
+ "НП-104-18": {"*": {"method": "group_entire_table"}},
14
  "НП-068-05": {
15
+ "Таблица 1": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
16
+ "Таблица 2": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
17
+ "Таблица Приложения 1": {"method": "group_by_column", "group_column": "Тип"}
 
 
18
  },
19
  "ГОСТ Р 59023.1-2020": {
20
+ "№ 1": {"method": "split_by_rows"},
21
+ "№ 2": {"method": "split_by_rows"},
22
+ "№ 3": {"method": "split_by_rows"}
 
 
 
 
 
 
 
23
  },
24
+ "НП-089-15": {"-": {"method": "split_by_rows"}},
25
+ "НП-105-18": {"№ 4.8": {"method": "group_entire_table"}},
26
+ "ГОСТ Р 50.05.23-2020": {"№8": {"method": "group_entire_table"}},
27
+ "ГОСТ Р 50.03.01-2017": {"А.8": {"method": "group_entire_table"}}
 
 
 
 
 
 
 
 
 
 
 
28
  }
29
 
30
  def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
 
36
  return base_info
37
 
38
def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
    """Render a table chunk as plain text.

    The output is: meta info (trailing whitespace stripped), a single
    "Заголовки: ..." header line, then one line per row. Cells whose value
    is falsy (empty string, None, 0) are omitted. When add_row_numbers is
    True each row line is prefixed with its 1-based index.
    """
    lines = [meta_info.rstrip(), "Заголовки: " + " | ".join(headers)]

    for index, row in enumerate(rows, start=1):
        cells = [f"{h}: {row.get(h, '')}" for h in headers if row.get(h, '')]
        joined = ' | '.join(cells)
        lines.append(f"Строка {index}: {joined}" if add_row_numbers else joined)

    return "\n".join(lines)
50
+
51
def get_custom_config(document_id, table_number):
    """Look up the custom-processing config for a (document, table) pair.

    The first CUSTOM_TABLE_CONFIGS entry whose key is a prefix of
    document_id is used; within it, an exact table_number entry wins over
    the "*" wildcard. Returns None when nothing matches.
    """
    tables = next(
        (cfg for pattern, cfg in CUSTOM_TABLE_CONFIGS.items()
         if document_id.startswith(pattern)),
        None,
    )
    if tables is None:
        return None
    return tables.get(table_number, tables.get("*"))
56
+
57
  def group_by_column_method(table_data, document_name, group_column):
 
58
  documents = []
59
  headers = table_data.get("headers", [])
60
  rows = table_data.get("data", [])
 
64
 
65
  grouped = defaultdict(list)
66
  for row in rows:
67
+ grouped[row.get(group_column, "UNKNOWN")].append(row)
 
68
 
69
  for group_value, group_rows in grouped.items():
70
  meta_info = create_meta_info(document_name, section, table_number, table_title,
71
  f'Группа по "{group_column}": {group_value}')
 
72
  chunk_text = create_chunk_text(meta_info, headers, group_rows, add_row_numbers=True)
73
 
74
+ documents.append(Document(
75
  text=chunk_text,
76
  metadata={
77
  "type": "table",
 
85
  "total_rows": len(group_rows),
86
  "processing_method": "group_by_column"
87
  }
88
+ ))
 
 
89
 
90
  return documents
91
 
92
  def split_by_rows_method(table_data, document_name):
 
93
  documents = []
94
  headers = table_data.get("headers", [])
95
  rows = table_data.get("data", [])
 
99
 
100
  for i, row in enumerate(rows, start=1):
101
  meta_info = create_meta_info(document_name, section, table_number, table_title, f'Строка: {i}')
 
102
  chunk_text = create_chunk_text(meta_info, headers, [row])
103
 
104
+ documents.append(Document(
105
  text=chunk_text,
106
  metadata={
107
  "type": "table",
 
114
  "total_rows": len(rows),
115
  "processing_method": "split_by_rows"
116
  }
117
+ ))
 
118
 
 
119
  return documents
120
 
121
  def group_entire_table_method(table_data, document_name):
 
122
  headers = table_data.get("headers", [])
123
  rows = table_data.get("data", [])
124
  section = table_data.get("section", "")
 
128
  meta_info = create_meta_info(document_name, section, table_number, table_title)
129
  chunk_text = create_chunk_text(meta_info, headers, rows)
130
 
131
+ return [Document(
132
  text=chunk_text,
133
  metadata={
134
  "type": "table",
 
140
  "total_rows": len(rows),
141
  "processing_method": "group_entire_table"
142
  }
143
+ )]
 
 
 
144
 
145
def process_table(table_data, document_name, method_config):
    """Dispatch a table to the handler named by method_config["method"].

    Supported methods: "group_by_column" (extra "group_column" option),
    "split_by_rows", "group_entire_table". Returns the handler's list of
    Documents, or None for an unrecognised method name.
    """
    method = method_config.get("method")

    if method == "split_by_rows":
        return split_by_rows_method(table_data, document_name)
    if method == "group_entire_table":
        return group_entire_table_method(table_data, document_name)
    if method == "group_by_column":
        return group_by_column_method(
            table_data, document_name, method_config.get("group_column")
        )
    return None
 
 
155
 
156
def table_to_document(table_data, document_id=None):
    """Convert one table dict into a list of llama_index Documents.

    Tries the document/table-specific custom processing first; when no
    custom config matches (or the custom handler yields nothing) the table
    is rendered with the default row-per-line format. Non-dict input
    returns an empty list.
    """
    if not isinstance(table_data, dict):
        return []

    # Explicit document_id wins; otherwise fall back to the ids embedded
    # in the table payload ('document_id', then legacy 'document').
    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    method_config = get_custom_config(doc_id, table_num)

    if method_config:
        log_message(f"✓ Таблица {table_num} '{table_title}' в документе {doc_id}: метод {method_config['method']}")
        custom_docs = process_table(table_data, doc_id, method_config)
        # An unknown method returns None -> fall through to default handling.
        if custom_docs:
            return custom_docs

    # Default rendering: human-readable header block, optionally followed
    # by the table rows.
    header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"

    if 'data' in table_data and isinstance(table_data['data'], list):
        table_content = header_content + "\nДанные таблицы:\n"
        for row_idx, row in enumerate(table_data['data']):
            # Non-dict rows are silently skipped.
            if isinstance(row, dict):
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
                table_content += f"Строка {row_idx + 1}: {row_text}\n"

        return [Document(
            text=table_content,
            metadata={
                "type": "table",
                "table_number": table_num,
                "table_title": table_title,
                "document_id": doc_id,
                "section": section,
                # section_id mirrors section so both lookup keys work downstream.
                "section_id": section,
                "total_rows": len(table_data['data']),
                "processing_method": "default"
            }
        )]

    # No row data: emit a header-only document (note: no "total_rows" key).
    return [Document(
        text=header_content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "processing_method": "default"
        }
    )]
208
 
209
  def load_table_data(repo_id, hf_token, table_data_dir):
210
+ log_message("Загрузка табличных данных")
 
211
 
 
212
  try:
213
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
214
+ table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
 
 
215
 
216
  log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
217
 
218
  table_documents = []
219
  for file_path in table_files:
220
  try:
 
221
  local_path = hf_hub_download(
222
  repo_id=repo_id,
223
  filename=file_path,
 
235
  if 'sheets' in table_data:
236
  for sheet in table_data['sheets']:
237
  sheet['document'] = document_id
 
 
 
 
 
 
 
238
  docs_list = table_to_document(sheet, document_id)
239
  table_documents.extend(docs_list)
240
  else:
 
 
 
 
 
 
 
241
  docs_list = table_to_document(table_data, document_id)
242
  table_documents.extend(docs_list)
243
  elif isinstance(table_data, list):
244
  for table_json in table_data:
 
 
 
 
 
 
 
245
  docs_list = table_to_document(table_json)
246
  table_documents.extend(docs_list)
247
 
248
  except Exception as e:
249
+ log_message(f"Ошибка файла {file_path}: {str(e)}")
250
  continue
251
 
252
  log_message(f"Создано {len(table_documents)} документов из таблиц")
utils.py CHANGED
@@ -371,39 +371,15 @@ def generate_sources_html(nodes, chunks_df=None):
371
 
372
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
373
  if query_engine is None:
374
- return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
375
 
376
  try:
377
- log_message(f"Получен вопрос: {question}")
378
  start_time = time.time()
379
 
380
- # Извлечение узлов
381
  retrieved_nodes = query_engine.retriever.retrieve(question)
382
- log_message(f"Извлечено {len(retrieved_nodes)} узлов")
383
-
384
- # ДЕТАЛЬНОЕ ЛОГИРОВАНИЕ ИСТОЧНИКОВ
385
- log_message("=== ДЕТАЛЬНАЯ ИНФОРМАЦИЯ О НАЙДЕННЫХ УЗЛАХ ===")
386
- for i, node in enumerate(retrieved_nodes):
387
- log_message(f"Узел {i+1}:")
388
- log_message(f" Документ: {node.metadata.get('document_id', 'unknown')}")
389
- log_message(f" Тип: {node.metadata.get('type', 'unknown')}")
390
- log_message(f" Раздел: {node.metadata.get('section_id', 'unknown')}")
391
- log_message(f" Текст (первые 400 символов): {node.text[:400]}...")
392
- log_message(f" Метаданные: {node.metadata}")
393
-
394
- # Переранжировка
395
  reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
396
 
397
- log_message("=== УЗЛЫ ПОСЛЕ ПЕРЕРАНЖИРОВКИ ===")
398
- for i, node in enumerate(reranked_nodes):
399
- log_message(f"Переранжированный узел {i+1}:")
400
- log_message(f" Документ: {node.metadata.get('document_id', 'unknown')}")
401
- log_message(f" Тип: {node.metadata.get('type', 'unknown')}")
402
- log_message(f" Раздел: {node.metadata.get('section_id', 'unknown')}")
403
- log_message(f" Полный текст: {node.text}")
404
-
405
  formatted_context = format_context_for_llm(reranked_nodes)
406
- log_message(f"ПОЛНЫЙ КОНТЕКСТ ДЛЯ LLM:\n{formatted_context}")
407
 
408
  enhanced_question = f"""
409
  Контекст из базы данных:
@@ -413,12 +389,10 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
413
 
414
  response = query_engine.query(enhanced_question)
415
 
416
- log_message(f"ОТВЕТ LLM: {response.response}")
417
-
418
  end_time = time.time()
419
  processing_time = end_time - start_time
420
 
421
- log_message(f"Обработка завершена за {processing_time:.2f} секунд")
422
 
423
  sources_html = generate_sources_html(reranked_nodes, chunks_df)
424
 
@@ -432,10 +406,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
432
 
433
  chunk_info = []
434
  for node in reranked_nodes:
435
- section_id = node.metadata.get('section_id', node.metadata.get('section', 'unknown'))
436
  chunk_info.append({
437
- 'document_id': node.metadata.get('document_id', 'unknown'),
438
- 'section_id': section_id,
 
 
 
 
 
 
 
 
439
  'chunk_size': len(node.text),
440
  'chunk_text': node.text
441
  })
@@ -445,6 +427,6 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
445
  return answer_with_time, sources_html, chunks_html
446
 
447
  except Exception as e:
448
- log_message(f"Ошибка обработки вопроса: {str(e)}")
449
- error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка обработки вопроса: {str(e)}</div>"
450
- return error_msg, ""
 
371
 
372
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
373
  if query_engine is None:
374
+ return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
375
 
376
  try:
 
377
  start_time = time.time()
378
 
 
379
  retrieved_nodes = query_engine.retriever.retrieve(question)
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
381
 
 
 
 
 
 
 
 
 
382
  formatted_context = format_context_for_llm(reranked_nodes)
 
383
 
384
  enhanced_question = f"""
385
  Контекст из базы данных:
 
389
 
390
  response = query_engine.query(enhanced_question)
391
 
 
 
392
  end_time = time.time()
393
  processing_time = end_time - start_time
394
 
395
+ log_message(f"Обработка завершена за {processing_time:.2f}с")
396
 
397
  sources_html = generate_sources_html(reranked_nodes, chunks_df)
398
 
 
406
 
407
  chunk_info = []
408
  for node in reranked_nodes:
409
+ metadata = node.metadata if hasattr(node, 'metadata') else {}
410
  chunk_info.append({
411
+ 'document_id': metadata.get('document_id', 'unknown'),
412
+ 'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
413
+ 'section_path': metadata.get('section_path', ''),
414
+ 'section_text': metadata.get('section_text', ''),
415
+ 'level': metadata.get('level', ''),
416
+ 'parent_section': metadata.get('parent_section', ''),
417
+ 'parent_title': metadata.get('parent_title', ''),
418
+ 'type': metadata.get('type', 'text'),
419
+ 'table_number': metadata.get('table_number', ''),
420
+ 'image_number': metadata.get('image_number', ''),
421
  'chunk_size': len(node.text),
422
  'chunk_text': node.text
423
  })
 
427
  return answer_with_time, sources_html, chunks_html
428
 
429
  except Exception as e:
430
+ log_message(f"Ошибка: {str(e)}")
431
+ error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
432
+ return error_msg, "", ""