Spaces:
Sleeping
Sleeping
Commit ·
bf93cc0
1
Parent(s): dc23650
simple
Browse files- table_prep.py +16 -82
table_prep.py
CHANGED
|
@@ -5,6 +5,7 @@ from llama_index.core import Document
|
|
| 5 |
from my_logging import log_message
|
| 6 |
|
| 7 |
def create_table_content(table_data):
|
|
|
|
| 8 |
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
| 9 |
table_num = table_data.get('table_number', 'Неизвестно')
|
| 10 |
table_title = table_data.get('table_title', 'Неизвестно')
|
|
@@ -32,36 +33,32 @@ def table_to_document(table_data, document_id=None):
|
|
| 32 |
"""Convert table data to a single Document"""
|
| 33 |
if not isinstance(table_data, dict):
|
| 34 |
return []
|
| 35 |
-
|
| 36 |
doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
| 37 |
table_num = table_data.get('table_number', 'Неизвестно')
|
| 38 |
table_title = table_data.get('table_title', 'Неизвестно')
|
| 39 |
section = table_data.get('section', 'Неизвестно')
|
| 40 |
-
|
| 41 |
-
|
| 42 |
content = create_table_content(table_data)
|
| 43 |
content_size = len(content)
|
| 44 |
-
|
|
|
|
| 45 |
row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
|
| 46 |
log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
|
| 47 |
f"Размер: {content_size} символов | Строк: {row_count}")
|
| 48 |
-
|
| 49 |
-
metadata = {
|
| 50 |
-
"type": "table",
|
| 51 |
-
"table_number": table_num,
|
| 52 |
-
"table_title": table_title,
|
| 53 |
-
"document_id": doc_id,
|
| 54 |
-
"section": section,
|
| 55 |
-
"section_id": section,
|
| 56 |
-
"total_rows": row_count,
|
| 57 |
-
"content_size": content_size
|
| 58 |
-
}
|
| 59 |
-
if sheet_name:
|
| 60 |
-
metadata["sheet_name"] = sheet_name
|
| 61 |
-
|
| 62 |
return [Document(
|
| 63 |
text=content,
|
| 64 |
-
metadata=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
)]
|
| 66 |
|
| 67 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
|
@@ -152,66 +149,3 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 152 |
except Exception as e:
|
| 153 |
log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 154 |
return []
|
| 155 |
-
|
| 156 |
-
CUSTOM_TABLE_CONFIGS = {
|
| 157 |
-
"НП-105-18": {
|
| 158 |
-
"tables": {
|
| 159 |
-
"№ 4.8": {"method": "group_entire_table"}
|
| 160 |
-
}
|
| 161 |
-
},
|
| 162 |
-
"ГОСТ Р 50.05.23-2020": {
|
| 163 |
-
"tables": {
|
| 164 |
-
"№8": {"method": "group_entire_table"}
|
| 165 |
-
}
|
| 166 |
-
},
|
| 167 |
-
"ГОСТ Р 50.03.01-2017": {
|
| 168 |
-
"tables": {
|
| 169 |
-
"А.8": {"method": "group_entire_table"}
|
| 170 |
-
}
|
| 171 |
-
}
|
| 172 |
-
}
|
| 173 |
-
|
| 174 |
-
def create_meta_info(document_name, section, table_number, table_title):
|
| 175 |
-
meta_info = f"Документ: {document_name}\n"
|
| 176 |
-
meta_info += f"Раздел: {section}\n"
|
| 177 |
-
meta_info += f"Таблица: {table_number}\n"
|
| 178 |
-
meta_info += f"Название таблицы: {table_title}\n"
|
| 179 |
-
return meta_info
|
| 180 |
-
|
| 181 |
-
def create_chunk_text(meta_info, headers, rows):
|
| 182 |
-
|
| 183 |
-
header_line = ", ".join(headers)
|
| 184 |
-
row_lines = ["; ".join(map(str, row)) for row in rows]
|
| 185 |
-
chunk = f"Meta: {meta_info}\nHeaders: {header_line}\nRows:\n" + "\n".join(row_lines)
|
| 186 |
-
return chunk
|
| 187 |
-
|
| 188 |
-
def group_entire_table_method(table_data, document_name):
|
| 189 |
-
"""Group entire table as one chunk"""
|
| 190 |
-
headers = table_data.get("headers", [])
|
| 191 |
-
rows = table_data.get("data", [])
|
| 192 |
-
section = table_data.get("section", "")
|
| 193 |
-
table_number = table_data.get("table_number", "")
|
| 194 |
-
table_title = table_data.get("table_title", "")
|
| 195 |
-
sheet_name = table_data.get("sheet_name", None)
|
| 196 |
-
|
| 197 |
-
meta_info = create_meta_info(document_name, section, table_number, table_title)
|
| 198 |
-
chunk_text = create_chunk_text(meta_info, headers, rows)
|
| 199 |
-
metadata = {
|
| 200 |
-
"type": "table",
|
| 201 |
-
"table_number": table_number,
|
| 202 |
-
"table_title": table_title,
|
| 203 |
-
"document_id": document_name,
|
| 204 |
-
"section": section,
|
| 205 |
-
"section_id": section,
|
| 206 |
-
"total_rows": len(rows),
|
| 207 |
-
"processing_method": "group_entire_table"
|
| 208 |
-
}
|
| 209 |
-
if sheet_name:
|
| 210 |
-
metadata["sheet_name"] = sheet_name
|
| 211 |
-
|
| 212 |
-
doc = Document(
|
| 213 |
-
text=chunk_text,
|
| 214 |
-
metadata=metadata
|
| 215 |
-
)
|
| 216 |
-
log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
|
| 217 |
-
return [doc]
|
|
|
|
| 5 |
from my_logging import log_message
|
| 6 |
|
| 7 |
def create_table_content(table_data):
|
| 8 |
+
"""Create formatted content from table data"""
|
| 9 |
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
| 10 |
table_num = table_data.get('table_number', 'Неизвестно')
|
| 11 |
table_title = table_data.get('table_title', 'Неизвестно')
|
|
|
|
| 33 |
"""Convert table data to a single Document"""
|
| 34 |
if not isinstance(table_data, dict):
|
| 35 |
return []
|
| 36 |
+
|
| 37 |
doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
| 38 |
table_num = table_data.get('table_number', 'Неизвестно')
|
| 39 |
table_title = table_data.get('table_title', 'Неизвестно')
|
| 40 |
section = table_data.get('section', 'Неизвестно')
|
| 41 |
+
|
|
|
|
| 42 |
content = create_table_content(table_data)
|
| 43 |
content_size = len(content)
|
| 44 |
+
|
| 45 |
+
# Log table addition
|
| 46 |
row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
|
| 47 |
log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
|
| 48 |
f"Размер: {content_size} символов | Строк: {row_count}")
|
| 49 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
return [Document(
|
| 51 |
text=content,
|
| 52 |
+
metadata={
|
| 53 |
+
"type": "table",
|
| 54 |
+
"table_number": table_num,
|
| 55 |
+
"table_title": table_title,
|
| 56 |
+
"document_id": doc_id,
|
| 57 |
+
"section": section,
|
| 58 |
+
"section_id": section,
|
| 59 |
+
"total_rows": row_count,
|
| 60 |
+
"content_size": content_size
|
| 61 |
+
}
|
| 62 |
)]
|
| 63 |
|
| 64 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
|
|
|
| 149 |
except Exception as e:
|
| 150 |
log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 151 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|