MrSimple07 committed on
Commit
6562b97
·
1 Parent(s): 4ce52d0

Fix: return a tuple (major, minor) instead of a float.

Browse files
Files changed (1) hide show
  1. table_prep.py +86 -150
table_prep.py CHANGED
@@ -4,80 +4,6 @@ from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
- # Custom table configurations
8
- CUSTOM_TABLE_CONFIGS = {
9
- "НП-104-18": {
10
- "tables": {} # Add specific tables here if needed
11
- },
12
- "НП-105-18": {
13
- "tables": {
14
- "№ 4.8": {"method": "group_entire_table"}
15
- }
16
- },
17
- "ГОСТ Р 50.05.23-2020": {
18
- "tables": {
19
- "№8": {"method": "group_entire_table"}
20
- }
21
- },
22
- "ГОСТ Р 50.03.01-2017": {
23
- "tables": {
24
- "А.8": {"method": "group_entire_table"}
25
- }
26
- }
27
- }
28
-
29
- def create_meta_info(document_name, section, table_number, table_title):
30
- """Create metadata information for table"""
31
- meta = f"Таблица: {table_number}\n"
32
- meta += f"Название: {table_title}\n"
33
- meta += f"Документ: {document_name}\n"
34
- meta += f"Раздел: {section}\n"
35
- return meta
36
-
37
- def create_chunk_text(meta_info, headers, rows):
38
- """Create formatted text from table data"""
39
- chunk_text = meta_info
40
-
41
- if headers:
42
- chunk_text += f"\nЗаголовки: {' | '.join(headers)}\n"
43
-
44
- chunk_text += "\nДанные таблицы:\n"
45
- for row_idx, row in enumerate(rows, start=1):
46
- if isinstance(row, dict):
47
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
48
- chunk_text += f"Строка {row_idx}: {row_text}\n"
49
-
50
- return chunk_text
51
-
52
- def group_entire_table_method(table_data, document_name):
53
- """Group entire table as one chunk"""
54
- headers = table_data.get("headers", [])
55
- rows = table_data.get("data", [])
56
- section = table_data.get("section", "")
57
- table_number = table_data.get("table_number", "")
58
- table_title = table_data.get("table_title", "")
59
-
60
- meta_info = create_meta_info(document_name, section, table_number, table_title)
61
- chunk_text = create_chunk_text(meta_info, headers, rows)
62
-
63
- doc = Document(
64
- text=chunk_text,
65
- metadata={
66
- "type": "table",
67
- "table_number": table_number,
68
- "table_title": table_title,
69
- "document_id": document_name,
70
- "section": section,
71
- "section_id": section,
72
- "total_rows": len(rows),
73
- "processing_method": "group_entire_table",
74
- "content_size": len(chunk_text)
75
- }
76
- )
77
-
78
- log_message(f"✓ GROUPED ENTIRE TABLE: {table_number}, rows: {len(rows)}, size: {len(chunk_text)} символов")
79
- return [doc]
80
-
81
  def create_table_content(table_data):
82
  """Create formatted content from table data"""
83
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
@@ -104,66 +30,41 @@ def create_table_content(table_data):
104
  return content
105
 
106
  def table_to_document(table_data, document_id=None):
107
- """Convert table data to a single Document with custom processing support"""
108
  if not isinstance(table_data, dict):
109
  return []
110
-
111
  doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
112
  table_num = table_data.get('table_number', 'Неизвестно')
113
  table_title = table_data.get('table_title', 'Неизвестно')
114
  section = table_data.get('section', 'Неизвестно')
115
-
116
- # Check for custom processing
117
- if doc_id in CUSTOM_TABLE_CONFIGS:
118
- doc_config = CUSTOM_TABLE_CONFIGS[doc_id]
119
- if table_num in doc_config.get("tables", {}):
120
- method = doc_config["tables"][table_num].get("method")
121
- if method == "group_entire_table":
122
- return group_entire_table_method(table_data, doc_id)
123
-
124
- # Default processing
125
  content = create_table_content(table_data)
126
  content_size = len(content)
127
-
128
  row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
129
  log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
130
  f"Размер: {content_size} символов | Строк: {row_count}")
131
-
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  return [Document(
133
  text=content,
134
- metadata={
135
- "type": "table",
136
- "table_number": table_num,
137
- "table_title": table_title,
138
- "document_id": doc_id,
139
- "section": section,
140
- "section_id": section,
141
- "total_rows": row_count,
142
- "content_size": content_size
143
- }
144
  )]
145
 
146
- def extract_table_number(table_number_str):
147
- """Extract numeric value from table number for sorting"""
148
- import re
149
- if not table_number_str:
150
- return 0
151
-
152
- # Remove "№" and whitespace
153
- cleaned = str(table_number_str).replace('№', '').strip()
154
-
155
- # Try to extract the numeric part (handles formats like "9.1", "9.30", "А.8")
156
- match = re.search(r'(\d+)\.?(\d*)', cleaned)
157
- if match:
158
- major = int(match.group(1))
159
- minor = int(match.group(2)) if match.group(2) else 0
160
- # Create sortable number: major * 1000 + minor
161
- # This ensures 9.2 comes before 9.30
162
- return major * 1000 + minor
163
-
164
- # If no numbers found, try alphabetic sorting
165
- return hash(cleaned)
166
-
167
  def load_table_data(repo_id, hf_token, table_data_dir):
168
  log_message("=" * 60)
169
  log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
@@ -179,7 +80,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
179
  stats = {
180
  'total_tables': 0,
181
  'total_size': 0,
182
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0, 'tables': []})
183
  }
184
 
185
  for file_path in table_files:
@@ -201,11 +102,9 @@ def load_table_data(repo_id, hf_token, table_data_dir):
201
  document_id = table_data.get('document', 'unknown')
202
 
203
  if 'sheets' in table_data:
204
- # Sort sheets by table_number
205
  sorted_sheets = sorted(
206
  table_data['sheets'],
207
- key=lambda x: extract_table_number(x.get('table_number', ''))
208
- )
209
 
210
  for sheet in sorted_sheets:
211
  sheet['document'] = document_id
@@ -213,46 +112,22 @@ def load_table_data(repo_id, hf_token, table_data_dir):
213
  table_documents.extend(docs_list)
214
 
215
  for doc in docs_list:
216
- table_num = doc.metadata.get('table_number', '')
217
  stats['total_tables'] += 1
218
  size = doc.metadata.get('content_size', 0)
219
  stats['total_size'] += size
220
  stats['by_document'][document_id]['count'] += 1
221
  stats['by_document'][document_id]['size'] += size
222
- stats['by_document'][document_id]['tables'].append(table_num)
223
  else:
224
  docs_list = table_to_document(table_data, document_id)
225
  table_documents.extend(docs_list)
226
 
227
  for doc in docs_list:
228
- table_num = doc.metadata.get('table_number', '')
229
  stats['total_tables'] += 1
230
  size = doc.metadata.get('content_size', 0)
231
  stats['total_size'] += size
232
  stats['by_document'][document_id]['count'] += 1
233
  stats['by_document'][document_id]['size'] += size
234
- stats['by_document'][document_id]['tables'].append(table_num)
235
 
236
- elif isinstance(table_data, list):
237
- # Sort list by table_number
238
- sorted_tables = sorted(
239
- table_data,
240
- key=lambda x: extract_table_number(x.get('table_number', ''))
241
- )
242
-
243
- for table_json in sorted_tables:
244
- docs_list = table_to_document(table_json)
245
- table_documents.extend(docs_list)
246
-
247
- for doc in docs_list:
248
- doc_id = doc.metadata.get('document_id', 'unknown')
249
- table_num = doc.metadata.get('table_number', '')
250
- stats['total_tables'] += 1
251
- size = doc.metadata.get('content_size', 0)
252
- stats['total_size'] += size
253
- stats['by_document'][doc_id]['count'] += 1
254
- stats['by_document'][doc_id]['size'] += size
255
- stats['by_document'][doc_id]['tables'].append(table_num)
256
 
257
  except Exception as e:
258
  log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
@@ -270,8 +145,6 @@ def load_table_data(repo_id, hf_token, table_data_dir):
270
  for doc_id, doc_stats in sorted(stats['by_document'].items()):
271
  log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
272
  f"{doc_stats['size']:,} символов")
273
- log_message(f" Таблицы: {', '.join(doc_stats['tables'][:10])}"
274
- f"{'...' if len(doc_stats['tables']) > 10 else ''}")
275
 
276
  log_message("=" * 60)
277
 
@@ -279,4 +152,67 @@ def load_table_data(repo_id, hf_token, table_data_dir):
279
 
280
  except Exception as e:
281
  log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
282
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def create_table_content(table_data):
8
  """Create formatted content from table data"""
9
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
 
30
  return content
31
 
32
def table_to_document(table_data, document_id=None):
    """Convert one table dict into a single llama_index Document.

    Args:
        table_data: dict describing one table. Keys read here:
            'document_id' / 'document', 'table_number', 'table_title',
            'section', 'data' (row list) and, optionally, 'sheet_name'.
        document_id: optional explicit document id; when falsy, falls back
            to the ids stored inside table_data.

    Returns:
        A one-element list [Document], or [] when table_data is not a dict.
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')
    # Optional Excel sheet name; only written into metadata when present.
    sheet_name = table_data.get('sheet_name')

    content = create_table_content(table_data)
    content_size = len(content)

    # 'data' holds the table rows; a missing key means an empty table.
    row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
    log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
                f"Размер: {content_size} символов | Строк: {row_count}")

    metadata = {
        "type": "table",
        "table_number": table_num,
        "table_title": table_title,
        "document_id": doc_id,
        "section": section,
        "section_id": section,
        "total_rows": row_count,
        # load_table_data() aggregates per-document sizes from this key.
        "content_size": content_size
    }
    if sheet_name:
        metadata["sheet_name"] = sheet_name

    return [Document(
        text=content,
        metadata=metadata
    )]
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  def load_table_data(repo_id, hf_token, table_data_dir):
69
  log_message("=" * 60)
70
  log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
 
80
  stats = {
81
  'total_tables': 0,
82
  'total_size': 0,
83
+ 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
84
  }
85
 
86
  for file_path in table_files:
 
102
  document_id = table_data.get('document', 'unknown')
103
 
104
  if 'sheets' in table_data:
 
105
  sorted_sheets = sorted(
106
  table_data['sheets'],
107
+ )
 
108
 
109
  for sheet in sorted_sheets:
110
  sheet['document'] = document_id
 
112
  table_documents.extend(docs_list)
113
 
114
  for doc in docs_list:
 
115
  stats['total_tables'] += 1
116
  size = doc.metadata.get('content_size', 0)
117
  stats['total_size'] += size
118
  stats['by_document'][document_id]['count'] += 1
119
  stats['by_document'][document_id]['size'] += size
 
120
  else:
121
  docs_list = table_to_document(table_data, document_id)
122
  table_documents.extend(docs_list)
123
 
124
  for doc in docs_list:
 
125
  stats['total_tables'] += 1
126
  size = doc.metadata.get('content_size', 0)
127
  stats['total_size'] += size
128
  stats['by_document'][document_id]['count'] += 1
129
  stats['by_document'][document_id]['size'] += size
 
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  except Exception as e:
133
  log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
 
145
  for doc_id, doc_stats in sorted(stats['by_document'].items()):
146
  log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
147
  f"{doc_stats['size']:,} символов")
 
 
148
 
149
  log_message("=" * 60)
150
 
 
152
 
153
  except Exception as e:
154
  log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
155
+ return []
156
+
157
# Per-document overrides for table chunking. Maps document id -> config with
# a "tables" dict keyed by table number. The only method used in this file is
# "group_entire_table": emit the whole table (headers + all rows) as one chunk
# via group_entire_table_method() instead of default row-wise processing.
CUSTOM_TABLE_CONFIGS = {
    "НП-105-18": {
        "tables": {
            "№ 4.8": {"method": "group_entire_table"}
        }
    },
    "ГОСТ Р 50.05.23-2020": {
        "tables": {
            "№8": {"method": "group_entire_table"}
        }
    },
    "ГОСТ Р 50.03.01-2017": {
        "tables": {
            "А.8": {"method": "group_entire_table"}
        }
    }
}
174
+
175
def create_meta_info(document_name, section, table_number, table_title):
    """Build the Russian-language metadata header prepended to a table chunk.

    Returns:
        A string of four "key: value" lines (document, section, table number,
        table title), each terminated by a newline.
    """
    # Single f-string literal instead of repeated += concatenation.
    return (
        f"Документ: {document_name}\n"
        f"Раздел: {section}\n"
        f"Таблица: {table_number}\n"
        f"Название таблицы: {table_title}\n"
    )
181
+
182
def create_chunk_text(meta_info, headers, rows):
    """Format a table (meta header, column headers, rows) as one text chunk.

    Args:
        meta_info: metadata header string (see create_meta_info).
        headers: list of column header strings.
        rows: list of rows; each row is either a dict (cell name -> value)
            or a plain sequence of cell values.

    Returns:
        "Meta: ...\\nHeaders: ...\\nRows:\\n" followed by one line per row.
    """
    header_line = ", ".join(headers)
    row_lines = []
    for row in rows:
        if isinstance(row, dict):
            # Bug fix: "; ".join(map(str, row)) iterated only the dict KEYS,
            # dropping every cell value. Emit "key: value" pairs (skipping
            # empty cells) as the pre-refactor code did.
            row_lines.append("; ".join(f"{k}: {v}" for k, v in row.items() if v))
        else:
            row_lines.append("; ".join(map(str, row)))
    return f"Meta: {meta_info}\nHeaders: {header_line}\nRows:\n" + "\n".join(row_lines)
188
+
189
def group_entire_table_method(table_data, document_name):
    """Group an entire table into a single Document chunk.

    Applied to tables listed in CUSTOM_TABLE_CONFIGS with method
    "group_entire_table": headers and all rows become one chunk instead of
    being processed by the default path.

    Args:
        table_data: dict with 'headers', 'data', 'section', 'table_number',
            'table_title' and, optionally, 'sheet_name'.
        document_name: id of the source document.

    Returns:
        A one-element list with the grouped Document.
    """
    headers = table_data.get("headers", [])
    rows = table_data.get("data", [])
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")
    sheet_name = table_data.get("sheet_name")

    meta_info = create_meta_info(document_name, section, table_number, table_title)
    chunk_text = create_chunk_text(meta_info, headers, rows)

    metadata = {
        "type": "table",
        "table_number": table_number,
        "table_title": table_title,
        "document_id": document_name,
        "section": section,
        "section_id": section,
        "total_rows": len(rows),
        "processing_method": "group_entire_table",
        # Bug fix: this key was dropped in the refactor. load_table_data()
        # reads doc.metadata.get('content_size', 0) for its size statistics,
        # so grouped tables were silently counted as 0 bytes.
        "content_size": len(chunk_text)
    }
    if sheet_name:
        metadata["sheet_name"] = sheet_name

    doc = Document(
        text=chunk_text,
        metadata=metadata
    )
    log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
    return [doc]