MrSimple07 commited on
Commit
4ce52d0
·
1 Parent(s): 1b689ce

Fix: return a tuple (major, minor) instead of a float.

Browse files
Files changed (1) hide show
  1. table_prep.py +116 -28
table_prep.py CHANGED
@@ -4,6 +4,80 @@ from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def create_table_content(table_data):
8
  """Create formatted content from table data"""
9
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
@@ -30,7 +104,7 @@ def create_table_content(table_data):
30
  return content
31
 
32
  def table_to_document(table_data, document_id=None):
33
- """Convert table data to a single Document"""
34
  if not isinstance(table_data, dict):
35
  return []
36
 
@@ -39,10 +113,18 @@ def table_to_document(table_data, document_id=None):
39
  table_title = table_data.get('table_title', 'Неизвестно')
40
  section = table_data.get('section', 'Неизвестно')
41
 
 
 
 
 
 
 
 
 
 
42
  content = create_table_content(table_data)
43
  content_size = len(content)
44
 
45
- # Log table addition
46
  row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
47
  log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
48
  f"Размер: {content_size} символов | Строк: {row_count}")
@@ -61,6 +143,27 @@ def table_to_document(table_data, document_id=None):
61
  }
62
  )]
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def load_table_data(repo_id, hf_token, table_data_dir):
65
  log_message("=" * 60)
66
  log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
@@ -76,7 +179,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
76
  stats = {
77
  'total_tables': 0,
78
  'total_size': 0,
79
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
80
  }
81
 
82
  for file_path in table_files:
@@ -98,7 +201,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
98
  document_id = table_data.get('document', 'unknown')
99
 
100
  if 'sheets' in table_data:
101
- # Sort sheets by table_number to ensure correct order
102
  sorted_sheets = sorted(
103
  table_data['sheets'],
104
  key=lambda x: extract_table_number(x.get('table_number', ''))
@@ -110,21 +213,25 @@ def load_table_data(repo_id, hf_token, table_data_dir):
110
  table_documents.extend(docs_list)
111
 
112
  for doc in docs_list:
 
113
  stats['total_tables'] += 1
114
  size = doc.metadata.get('content_size', 0)
115
  stats['total_size'] += size
116
  stats['by_document'][document_id]['count'] += 1
117
  stats['by_document'][document_id]['size'] += size
 
118
  else:
119
  docs_list = table_to_document(table_data, document_id)
120
  table_documents.extend(docs_list)
121
 
122
  for doc in docs_list:
 
123
  stats['total_tables'] += 1
124
  size = doc.metadata.get('content_size', 0)
125
  stats['total_size'] += size
126
  stats['by_document'][document_id]['count'] += 1
127
  stats['by_document'][document_id]['size'] += size
 
128
 
129
  elif isinstance(table_data, list):
130
  # Sort list by table_number
@@ -139,11 +246,13 @@ def load_table_data(repo_id, hf_token, table_data_dir):
139
 
140
  for doc in docs_list:
141
  doc_id = doc.metadata.get('document_id', 'unknown')
 
142
  stats['total_tables'] += 1
143
  size = doc.metadata.get('content_size', 0)
144
  stats['total_size'] += size
145
  stats['by_document'][doc_id]['count'] += 1
146
  stats['by_document'][doc_id]['size'] += size
 
147
 
148
  except Exception as e:
149
  log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
@@ -161,6 +270,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
161
  for doc_id, doc_stats in sorted(stats['by_document'].items()):
162
  log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
163
  f"{doc_stats['size']:,} символов")
 
 
164
 
165
  log_message("=" * 60)
166
 
@@ -168,27 +279,4 @@ def load_table_data(repo_id, hf_token, table_data_dir):
168
 
169
  except Exception as e:
170
  log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
171
- return []
172
-
173
-
174
def extract_table_number(table_number_str):
    """Extract a sortable (major, minor) tuple from a table-number string.

    Returns (0, 0) for empty or unparseable input so sorting never raises.
    Tuples sort lexicographically, so (9, 2) correctly precedes (9, 30).
    """
    import re
    if not table_number_str:
        return (0, 0)

    # Remove "№" and every other non-digit/non-dot character
    cleaned = re.sub(r'[^0-9.]', '', str(table_number_str))

    try:
        # Split by dot to handle hierarchical numbering like "9.1"
        parts = cleaned.split('.')
        if len(parts) == 2:
            # Return tuple for proper sorting: (major, minor)
            return (int(parts[0]), int(parts[1]))
        elif len(parts) == 1 and parts[0]:
            return (int(parts[0]), 0)
        else:
            # Deeper hierarchies ("9.1.2") or all-dots input fall through here
            return (0, 0)
    except (ValueError, IndexError):
        return (0, 0)
 
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
# Custom table configurations.
# Maps a document ID -> {"tables": {<table_number>: {"method": <name>}}}.
# table_to_document consults this mapping: when a table's number is listed
# with method "group_entire_table", the whole table is emitted as a single
# Document (group_entire_table_method) instead of the default processing.
CUSTOM_TABLE_CONFIGS = {
    "НП-104-18": {
        "tables": {}  # Add specific tables here if needed
    },
    "НП-105-18": {
        "tables": {
            "№ 4.8": {"method": "group_entire_table"}
        }
    },
    "ГОСТ Р 50.05.23-2020": {
        "tables": {
            "№8": {"method": "group_entire_table"}
        }
    },
    "ГОСТ Р 50.03.01-2017": {
        "tables": {
            "А.8": {"method": "group_entire_table"}
        }
    }
}
28
+
29
def create_meta_info(document_name, section, table_number, table_title):
    """Create metadata information for table.

    Returns a newline-terminated header block listing the table number,
    title, source document and section.
    """
    fields = [
        f"Таблица: {table_number}",
        f"Название: {table_title}",
        f"Документ: {document_name}",
        f"Раздел: {section}",
    ]
    return "".join(field + "\n" for field in fields)
36
+
37
def create_chunk_text(meta_info, headers, rows):
    """Create formatted text from table data.

    Appends an optional header line and one numbered line per dict row
    (falsy cell values are dropped; non-dict rows are skipped but still
    consume a row number).
    """
    parts = [meta_info]

    if headers:
        parts.append(f"\nЗаголовки: {' | '.join(headers)}\n")

    parts.append("\nДанные таблицы:\n")
    for row_no, row in enumerate(rows, start=1):
        if not isinstance(row, dict):
            continue
        cells = " | ".join(f"{key}: {val}" for key, val in row.items() if val)
        parts.append(f"Строка {row_no}: {cells}\n")

    return "".join(parts)
51
+
52
def group_entire_table_method(table_data, document_name):
    """Group entire table as one chunk.

    Builds a single Document covering every row of the table, tagged with
    processing_method "group_entire_table", and logs the addition.
    """
    rows = table_data.get("data", [])
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")

    chunk_text = create_chunk_text(
        create_meta_info(document_name, section, table_number, table_title),
        table_data.get("headers", []),
        rows,
    )

    metadata = {
        "type": "table",
        "table_number": table_number,
        "table_title": table_title,
        "document_id": document_name,
        "section": section,
        "section_id": section,
        "total_rows": len(rows),
        "processing_method": "group_entire_table",
        "content_size": len(chunk_text),
    }

    log_message(f"✓ GROUPED ENTIRE TABLE: {table_number}, rows: {len(rows)}, size: {len(chunk_text)} символов")
    return [Document(text=chunk_text, metadata=metadata)]
80
+
81
  def create_table_content(table_data):
82
  """Create formatted content from table data"""
83
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
 
104
  return content
105
 
106
  def table_to_document(table_data, document_id=None):
107
+ """Convert table data to a single Document with custom processing support"""
108
  if not isinstance(table_data, dict):
109
  return []
110
 
 
113
  table_title = table_data.get('table_title', 'Неизвестно')
114
  section = table_data.get('section', 'Неизвестно')
115
 
116
+ # Check for custom processing
117
+ if doc_id in CUSTOM_TABLE_CONFIGS:
118
+ doc_config = CUSTOM_TABLE_CONFIGS[doc_id]
119
+ if table_num in doc_config.get("tables", {}):
120
+ method = doc_config["tables"][table_num].get("method")
121
+ if method == "group_entire_table":
122
+ return group_entire_table_method(table_data, doc_id)
123
+
124
+ # Default processing
125
  content = create_table_content(table_data)
126
  content_size = len(content)
127
 
 
128
  row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
129
  log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
130
  f"Размер: {content_size} символов | Строк: {row_count}")
 
143
  }
144
  )]
145
 
146
def extract_table_number(table_number_str):
    """Convert a table-number string into a deterministic, sortable integer.

    "9.2" -> 9002, "9.30" -> 9030 (so 9.2 sorts before 9.30),
    "№ 4.8" -> 4008, "А.8" -> 8000 (first digit run wins).
    Empty/None input returns 0.  Input with no digits at all gets a
    deterministic fallback key derived from its characters.
    """
    import re

    if not table_number_str:
        return 0

    # Remove "№" and surrounding whitespace
    cleaned = str(table_number_str).replace('№', '').strip()

    # Extract the numeric part (handles formats like "9.1", "9.30", "А.8")
    match = re.search(r'(\d+)\.?(\d*)', cleaned)
    if match:
        major = int(match.group(1))
        minor = int(match.group(2)) if match.group(2) else 0
        # Sortable key: major * 1000 + minor, so 9.2 comes before 9.30.
        # NOTE(review): assumes minor < 1000 — holds for table numbering.
        return major * 1000 + minor

    # No digits at all: derive a stable key from the characters themselves.
    # The previous fallback used hash(), whose value for strings is
    # randomized per process (PYTHONHASHSEED), so sort order was not
    # reproducible between runs.
    return sum(ord(ch) for ch in cleaned)
166
+
167
  def load_table_data(repo_id, hf_token, table_data_dir):
168
  log_message("=" * 60)
169
  log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
 
179
  stats = {
180
  'total_tables': 0,
181
  'total_size': 0,
182
+ 'by_document': defaultdict(lambda: {'count': 0, 'size': 0, 'tables': []})
183
  }
184
 
185
  for file_path in table_files:
 
201
  document_id = table_data.get('document', 'unknown')
202
 
203
  if 'sheets' in table_data:
204
+ # Sort sheets by table_number
205
  sorted_sheets = sorted(
206
  table_data['sheets'],
207
  key=lambda x: extract_table_number(x.get('table_number', ''))
 
213
  table_documents.extend(docs_list)
214
 
215
  for doc in docs_list:
216
+ table_num = doc.metadata.get('table_number', '')
217
  stats['total_tables'] += 1
218
  size = doc.metadata.get('content_size', 0)
219
  stats['total_size'] += size
220
  stats['by_document'][document_id]['count'] += 1
221
  stats['by_document'][document_id]['size'] += size
222
+ stats['by_document'][document_id]['tables'].append(table_num)
223
  else:
224
  docs_list = table_to_document(table_data, document_id)
225
  table_documents.extend(docs_list)
226
 
227
  for doc in docs_list:
228
+ table_num = doc.metadata.get('table_number', '')
229
  stats['total_tables'] += 1
230
  size = doc.metadata.get('content_size', 0)
231
  stats['total_size'] += size
232
  stats['by_document'][document_id]['count'] += 1
233
  stats['by_document'][document_id]['size'] += size
234
+ stats['by_document'][document_id]['tables'].append(table_num)
235
 
236
  elif isinstance(table_data, list):
237
  # Sort list by table_number
 
246
 
247
  for doc in docs_list:
248
  doc_id = doc.metadata.get('document_id', 'unknown')
249
+ table_num = doc.metadata.get('table_number', '')
250
  stats['total_tables'] += 1
251
  size = doc.metadata.get('content_size', 0)
252
  stats['total_size'] += size
253
  stats['by_document'][doc_id]['count'] += 1
254
  stats['by_document'][doc_id]['size'] += size
255
+ stats['by_document'][doc_id]['tables'].append(table_num)
256
 
257
  except Exception as e:
258
  log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
 
270
  for doc_id, doc_stats in sorted(stats['by_document'].items()):
271
  log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
272
  f"{doc_stats['size']:,} символов")
273
+ log_message(f" Таблицы: {', '.join(doc_stats['tables'][:10])}"
274
+ f"{'...' if len(doc_stats['tables']) > 10 else ''}")
275
 
276
  log_message("=" * 60)
277
 
 
279
 
280
  except Exception as e:
281
  log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
282
+ return []