MrSimple07 committed on
Commit
bf93cc0
·
1 Parent(s): dc23650
Files changed (1) hide show
  1. table_prep.py +16 -82
table_prep.py CHANGED
@@ -5,6 +5,7 @@ from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
  def create_table_content(table_data):
 
8
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
9
  table_num = table_data.get('table_number', 'Неизвестно')
10
  table_title = table_data.get('table_title', 'Неизвестно')
@@ -32,36 +33,32 @@ def table_to_document(table_data, document_id=None):
32
  """Convert table data to a single Document"""
33
  if not isinstance(table_data, dict):
34
  return []
35
-
36
  doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
37
  table_num = table_data.get('table_number', 'Неизвестно')
38
  table_title = table_data.get('table_title', 'Неизвестно')
39
  section = table_data.get('section', 'Неизвестно')
40
- sheet_name = table_data.get('sheet_name', None) # <-- add this
41
-
42
  content = create_table_content(table_data)
43
  content_size = len(content)
44
-
 
45
  row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
46
  log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
47
  f"Размер: {content_size} символов | Строк: {row_count}")
48
-
49
- metadata = {
50
- "type": "table",
51
- "table_number": table_num,
52
- "table_title": table_title,
53
- "document_id": doc_id,
54
- "section": section,
55
- "section_id": section,
56
- "total_rows": row_count,
57
- "content_size": content_size
58
- }
59
- if sheet_name:
60
- metadata["sheet_name"] = sheet_name
61
-
62
  return [Document(
63
  text=content,
64
- metadata=metadata
 
 
 
 
 
 
 
 
 
65
  )]
66
 
67
  def load_table_data(repo_id, hf_token, table_data_dir):
@@ -152,66 +149,3 @@ def load_table_data(repo_id, hf_token, table_data_dir):
152
  except Exception as e:
153
  log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
154
  return []
155
-
156
- CUSTOM_TABLE_CONFIGS = {
157
- "НП-105-18": {
158
- "tables": {
159
- "№ 4.8": {"method": "group_entire_table"}
160
- }
161
- },
162
- "ГОСТ Р 50.05.23-2020": {
163
- "tables": {
164
- "№8": {"method": "group_entire_table"}
165
- }
166
- },
167
- "ГОСТ Р 50.03.01-2017": {
168
- "tables": {
169
- "А.8": {"method": "group_entire_table"}
170
- }
171
- }
172
- }
173
-
174
- def create_meta_info(document_name, section, table_number, table_title):
175
- meta_info = f"Документ: {document_name}\n"
176
- meta_info += f"Раздел: {section}\n"
177
- meta_info += f"Таблица: {table_number}\n"
178
- meta_info += f"Название таблицы: {table_title}\n"
179
- return meta_info
180
-
181
- def create_chunk_text(meta_info, headers, rows):
182
-
183
- header_line = ", ".join(headers)
184
- row_lines = ["; ".join(map(str, row)) for row in rows]
185
- chunk = f"Meta: {meta_info}\nHeaders: {header_line}\nRows:\n" + "\n".join(row_lines)
186
- return chunk
187
-
188
- def group_entire_table_method(table_data, document_name):
189
- """Group entire table as one chunk"""
190
- headers = table_data.get("headers", [])
191
- rows = table_data.get("data", [])
192
- section = table_data.get("section", "")
193
- table_number = table_data.get("table_number", "")
194
- table_title = table_data.get("table_title", "")
195
- sheet_name = table_data.get("sheet_name", None)
196
-
197
- meta_info = create_meta_info(document_name, section, table_number, table_title)
198
- chunk_text = create_chunk_text(meta_info, headers, rows)
199
- metadata = {
200
- "type": "table",
201
- "table_number": table_number,
202
- "table_title": table_title,
203
- "document_id": document_name,
204
- "section": section,
205
- "section_id": section,
206
- "total_rows": len(rows),
207
- "processing_method": "group_entire_table"
208
- }
209
- if sheet_name:
210
- metadata["sheet_name"] = sheet_name
211
-
212
- doc = Document(
213
- text=chunk_text,
214
- metadata=metadata
215
- )
216
- log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
217
- return [doc]
 
5
  from my_logging import log_message
6
 
7
  def create_table_content(table_data):
8
+ """Create formatted content from table data"""
9
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
10
  table_num = table_data.get('table_number', 'Неизвестно')
11
  table_title = table_data.get('table_title', 'Неизвестно')
 
33
  """Convert table data to a single Document"""
34
  if not isinstance(table_data, dict):
35
  return []
36
+
37
  doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
38
  table_num = table_data.get('table_number', 'Неизвестно')
39
  table_title = table_data.get('table_title', 'Неизвестно')
40
  section = table_data.get('section', 'Неизвестно')
41
+
 
42
  content = create_table_content(table_data)
43
  content_size = len(content)
44
+
45
+ # Log table addition
46
  row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
47
  log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
48
  f"Размер: {content_size} символов | Строк: {row_count}")
49
+
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  return [Document(
51
  text=content,
52
+ metadata={
53
+ "type": "table",
54
+ "table_number": table_num,
55
+ "table_title": table_title,
56
+ "document_id": doc_id,
57
+ "section": section,
58
+ "section_id": section,
59
+ "total_rows": row_count,
60
+ "content_size": content_size
61
+ }
62
  )]
63
 
64
  def load_table_data(repo_id, hf_token, table_data_dir):
 
149
  except Exception as e:
150
  log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
151
  return []