Spaces:
Sleeping
Sleeping
Commit ·
6562b97
1
Parent(s): 4ce52d0
fix returns a tuple (major, minor) instead of a float.
Browse files
- table_prep.py +86 -150
table_prep.py
CHANGED
|
@@ -4,80 +4,6 @@ from huggingface_hub import hf_hub_download, list_repo_files
|
|
| 4 |
from llama_index.core import Document
|
| 5 |
from my_logging import log_message
|
| 6 |
|
| 7 |
-
# Custom table configurations
|
| 8 |
-
CUSTOM_TABLE_CONFIGS = {
|
| 9 |
-
"НП-104-18": {
|
| 10 |
-
"tables": {} # Add specific tables here if needed
|
| 11 |
-
},
|
| 12 |
-
"НП-105-18": {
|
| 13 |
-
"tables": {
|
| 14 |
-
"№ 4.8": {"method": "group_entire_table"}
|
| 15 |
-
}
|
| 16 |
-
},
|
| 17 |
-
"ГОСТ Р 50.05.23-2020": {
|
| 18 |
-
"tables": {
|
| 19 |
-
"№8": {"method": "group_entire_table"}
|
| 20 |
-
}
|
| 21 |
-
},
|
| 22 |
-
"ГОСТ Р 50.03.01-2017": {
|
| 23 |
-
"tables": {
|
| 24 |
-
"А.8": {"method": "group_entire_table"}
|
| 25 |
-
}
|
| 26 |
-
}
|
| 27 |
-
}
|
| 28 |
-
|
| 29 |
-
def create_meta_info(document_name, section, table_number, table_title):
|
| 30 |
-
"""Create metadata information for table"""
|
| 31 |
-
meta = f"Таблица: {table_number}\n"
|
| 32 |
-
meta += f"Название: {table_title}\n"
|
| 33 |
-
meta += f"Документ: {document_name}\n"
|
| 34 |
-
meta += f"Раздел: {section}\n"
|
| 35 |
-
return meta
|
| 36 |
-
|
| 37 |
-
def create_chunk_text(meta_info, headers, rows):
|
| 38 |
-
"""Create formatted text from table data"""
|
| 39 |
-
chunk_text = meta_info
|
| 40 |
-
|
| 41 |
-
if headers:
|
| 42 |
-
chunk_text += f"\nЗаголовки: {' | '.join(headers)}\n"
|
| 43 |
-
|
| 44 |
-
chunk_text += "\nДанные таблицы:\n"
|
| 45 |
-
for row_idx, row in enumerate(rows, start=1):
|
| 46 |
-
if isinstance(row, dict):
|
| 47 |
-
row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
|
| 48 |
-
chunk_text += f"Строка {row_idx}: {row_text}\n"
|
| 49 |
-
|
| 50 |
-
return chunk_text
|
| 51 |
-
|
| 52 |
-
def group_entire_table_method(table_data, document_name):
|
| 53 |
-
"""Group entire table as one chunk"""
|
| 54 |
-
headers = table_data.get("headers", [])
|
| 55 |
-
rows = table_data.get("data", [])
|
| 56 |
-
section = table_data.get("section", "")
|
| 57 |
-
table_number = table_data.get("table_number", "")
|
| 58 |
-
table_title = table_data.get("table_title", "")
|
| 59 |
-
|
| 60 |
-
meta_info = create_meta_info(document_name, section, table_number, table_title)
|
| 61 |
-
chunk_text = create_chunk_text(meta_info, headers, rows)
|
| 62 |
-
|
| 63 |
-
doc = Document(
|
| 64 |
-
text=chunk_text,
|
| 65 |
-
metadata={
|
| 66 |
-
"type": "table",
|
| 67 |
-
"table_number": table_number,
|
| 68 |
-
"table_title": table_title,
|
| 69 |
-
"document_id": document_name,
|
| 70 |
-
"section": section,
|
| 71 |
-
"section_id": section,
|
| 72 |
-
"total_rows": len(rows),
|
| 73 |
-
"processing_method": "group_entire_table",
|
| 74 |
-
"content_size": len(chunk_text)
|
| 75 |
-
}
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
log_message(f"✓ GROUPED ENTIRE TABLE: {table_number}, rows: {len(rows)}, size: {len(chunk_text)} символов")
|
| 79 |
-
return [doc]
|
| 80 |
-
|
| 81 |
def create_table_content(table_data):
|
| 82 |
"""Create formatted content from table data"""
|
| 83 |
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
|
@@ -104,66 +30,41 @@ def create_table_content(table_data):
|
|
| 104 |
return content
|
| 105 |
|
| 106 |
def table_to_document(table_data, document_id=None):
|
| 107 |
-
"""Convert table data to a single Document
|
| 108 |
if not isinstance(table_data, dict):
|
| 109 |
return []
|
| 110 |
-
|
| 111 |
doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
| 112 |
table_num = table_data.get('table_number', 'Неизвестно')
|
| 113 |
table_title = table_data.get('table_title', 'Неизвестно')
|
| 114 |
section = table_data.get('section', 'Неизвестно')
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
if doc_id in CUSTOM_TABLE_CONFIGS:
|
| 118 |
-
doc_config = CUSTOM_TABLE_CONFIGS[doc_id]
|
| 119 |
-
if table_num in doc_config.get("tables", {}):
|
| 120 |
-
method = doc_config["tables"][table_num].get("method")
|
| 121 |
-
if method == "group_entire_table":
|
| 122 |
-
return group_entire_table_method(table_data, doc_id)
|
| 123 |
-
|
| 124 |
-
# Default processing
|
| 125 |
content = create_table_content(table_data)
|
| 126 |
content_size = len(content)
|
| 127 |
-
|
| 128 |
row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
|
| 129 |
log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
|
| 130 |
f"Размер: {content_size} символов | Строк: {row_count}")
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
return [Document(
|
| 133 |
text=content,
|
| 134 |
-
metadata=
|
| 135 |
-
"type": "table",
|
| 136 |
-
"table_number": table_num,
|
| 137 |
-
"table_title": table_title,
|
| 138 |
-
"document_id": doc_id,
|
| 139 |
-
"section": section,
|
| 140 |
-
"section_id": section,
|
| 141 |
-
"total_rows": row_count,
|
| 142 |
-
"content_size": content_size
|
| 143 |
-
}
|
| 144 |
)]
|
| 145 |
|
| 146 |
-
def extract_table_number(table_number_str):
|
| 147 |
-
"""Extract numeric value from table number for sorting"""
|
| 148 |
-
import re
|
| 149 |
-
if not table_number_str:
|
| 150 |
-
return 0
|
| 151 |
-
|
| 152 |
-
# Remove "№" and whitespace
|
| 153 |
-
cleaned = str(table_number_str).replace('№', '').strip()
|
| 154 |
-
|
| 155 |
-
# Try to extract the numeric part (handles formats like "9.1", "9.30", "А.8")
|
| 156 |
-
match = re.search(r'(\d+)\.?(\d*)', cleaned)
|
| 157 |
-
if match:
|
| 158 |
-
major = int(match.group(1))
|
| 159 |
-
minor = int(match.group(2)) if match.group(2) else 0
|
| 160 |
-
# Create sortable number: major * 1000 + minor
|
| 161 |
-
# This ensures 9.2 comes before 9.30
|
| 162 |
-
return major * 1000 + minor
|
| 163 |
-
|
| 164 |
-
# If no numbers found, try alphabetic sorting
|
| 165 |
-
return hash(cleaned)
|
| 166 |
-
|
| 167 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 168 |
log_message("=" * 60)
|
| 169 |
log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
|
|
@@ -179,7 +80,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 179 |
stats = {
|
| 180 |
'total_tables': 0,
|
| 181 |
'total_size': 0,
|
| 182 |
-
'by_document': defaultdict(lambda: {'count': 0, 'size': 0
|
| 183 |
}
|
| 184 |
|
| 185 |
for file_path in table_files:
|
|
@@ -201,11 +102,9 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 201 |
document_id = table_data.get('document', 'unknown')
|
| 202 |
|
| 203 |
if 'sheets' in table_data:
|
| 204 |
-
# Sort sheets by table_number
|
| 205 |
sorted_sheets = sorted(
|
| 206 |
table_data['sheets'],
|
| 207 |
-
|
| 208 |
-
)
|
| 209 |
|
| 210 |
for sheet in sorted_sheets:
|
| 211 |
sheet['document'] = document_id
|
|
@@ -213,46 +112,22 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 213 |
table_documents.extend(docs_list)
|
| 214 |
|
| 215 |
for doc in docs_list:
|
| 216 |
-
table_num = doc.metadata.get('table_number', '')
|
| 217 |
stats['total_tables'] += 1
|
| 218 |
size = doc.metadata.get('content_size', 0)
|
| 219 |
stats['total_size'] += size
|
| 220 |
stats['by_document'][document_id]['count'] += 1
|
| 221 |
stats['by_document'][document_id]['size'] += size
|
| 222 |
-
stats['by_document'][document_id]['tables'].append(table_num)
|
| 223 |
else:
|
| 224 |
docs_list = table_to_document(table_data, document_id)
|
| 225 |
table_documents.extend(docs_list)
|
| 226 |
|
| 227 |
for doc in docs_list:
|
| 228 |
-
table_num = doc.metadata.get('table_number', '')
|
| 229 |
stats['total_tables'] += 1
|
| 230 |
size = doc.metadata.get('content_size', 0)
|
| 231 |
stats['total_size'] += size
|
| 232 |
stats['by_document'][document_id]['count'] += 1
|
| 233 |
stats['by_document'][document_id]['size'] += size
|
| 234 |
-
stats['by_document'][document_id]['tables'].append(table_num)
|
| 235 |
|
| 236 |
-
elif isinstance(table_data, list):
|
| 237 |
-
# Sort list by table_number
|
| 238 |
-
sorted_tables = sorted(
|
| 239 |
-
table_data,
|
| 240 |
-
key=lambda x: extract_table_number(x.get('table_number', ''))
|
| 241 |
-
)
|
| 242 |
-
|
| 243 |
-
for table_json in sorted_tables:
|
| 244 |
-
docs_list = table_to_document(table_json)
|
| 245 |
-
table_documents.extend(docs_list)
|
| 246 |
-
|
| 247 |
-
for doc in docs_list:
|
| 248 |
-
doc_id = doc.metadata.get('document_id', 'unknown')
|
| 249 |
-
table_num = doc.metadata.get('table_number', '')
|
| 250 |
-
stats['total_tables'] += 1
|
| 251 |
-
size = doc.metadata.get('content_size', 0)
|
| 252 |
-
stats['total_size'] += size
|
| 253 |
-
stats['by_document'][doc_id]['count'] += 1
|
| 254 |
-
stats['by_document'][doc_id]['size'] += size
|
| 255 |
-
stats['by_document'][doc_id]['tables'].append(table_num)
|
| 256 |
|
| 257 |
except Exception as e:
|
| 258 |
log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
|
|
@@ -270,8 +145,6 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 270 |
for doc_id, doc_stats in sorted(stats['by_document'].items()):
|
| 271 |
log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
|
| 272 |
f"{doc_stats['size']:,} символов")
|
| 273 |
-
log_message(f" Таблицы: {', '.join(doc_stats['tables'][:10])}"
|
| 274 |
-
f"{'...' if len(doc_stats['tables']) > 10 else ''}")
|
| 275 |
|
| 276 |
log_message("=" * 60)
|
| 277 |
|
|
@@ -279,4 +152,67 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 279 |
|
| 280 |
except Exception as e:
|
| 281 |
log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 282 |
-
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from llama_index.core import Document
|
| 5 |
from my_logging import log_message
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
def create_table_content(table_data):
|
| 8 |
"""Create formatted content from table data"""
|
| 9 |
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
|
|
|
| 30 |
return content
|
| 31 |
|
| 32 |
def table_to_document(table_data, document_id=None):
    """Convert one table record into a list holding a single Document.

    Args:
        table_data: Table record. Keys read here are 'document_id' /
            'document', 'table_number', 'table_title', 'section',
            'sheet_name' and 'data'.
        document_id: Optional identifier that takes precedence over the one
            stored in ``table_data``.

    Returns:
        A one-element list containing the Document, or an empty list when
        ``table_data`` is not a dict.
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')
    sheet_name = table_data.get('sheet_name')

    content = create_table_content(table_data)
    content_size = len(content)

    # A 'data' key that is present but null counts as zero rows instead of
    # raising TypeError from len(None).
    row_count = len(table_data.get('data') or [])
    log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
                f"Размер: {content_size} символов | Строк: {row_count}")

    metadata = {
        "type": "table",
        "table_number": table_num,
        "table_title": table_title,
        "document_id": doc_id,
        "section": section,
        "section_id": section,
        "total_rows": row_count,
        "content_size": content_size,
    }
    # The sheet name is only recorded when the record actually carries one.
    if sheet_name:
        metadata["sheet_name"] = sheet_name

    return [Document(
        text=content,
        metadata=metadata
    )]
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 69 |
log_message("=" * 60)
|
| 70 |
log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
|
|
|
|
| 80 |
stats = {
|
| 81 |
'total_tables': 0,
|
| 82 |
'total_size': 0,
|
| 83 |
+
'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
|
| 84 |
}
|
| 85 |
|
| 86 |
for file_path in table_files:
|
|
|
|
| 102 |
document_id = table_data.get('document', 'unknown')
|
| 103 |
|
| 104 |
if 'sheets' in table_data:
|
|
|
|
| 105 |
sorted_sheets = sorted(
|
| 106 |
table_data['sheets'],
|
| 107 |
+
)
|
|
|
|
| 108 |
|
| 109 |
for sheet in sorted_sheets:
|
| 110 |
sheet['document'] = document_id
|
|
|
|
| 112 |
table_documents.extend(docs_list)
|
| 113 |
|
| 114 |
for doc in docs_list:
|
|
|
|
| 115 |
stats['total_tables'] += 1
|
| 116 |
size = doc.metadata.get('content_size', 0)
|
| 117 |
stats['total_size'] += size
|
| 118 |
stats['by_document'][document_id]['count'] += 1
|
| 119 |
stats['by_document'][document_id]['size'] += size
|
|
|
|
| 120 |
else:
|
| 121 |
docs_list = table_to_document(table_data, document_id)
|
| 122 |
table_documents.extend(docs_list)
|
| 123 |
|
| 124 |
for doc in docs_list:
|
|
|
|
| 125 |
stats['total_tables'] += 1
|
| 126 |
size = doc.metadata.get('content_size', 0)
|
| 127 |
stats['total_size'] += size
|
| 128 |
stats['by_document'][document_id]['count'] += 1
|
| 129 |
stats['by_document'][document_id]['size'] += size
|
|
|
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
except Exception as e:
|
| 133 |
log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
|
|
|
|
| 145 |
for doc_id, doc_stats in sorted(stats['by_document'].items()):
|
| 146 |
log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
|
| 147 |
f"{doc_stats['size']:,} символов")
|
|
|
|
|
|
|
| 148 |
|
| 149 |
log_message("=" * 60)
|
| 150 |
|
|
|
|
| 152 |
|
| 153 |
except Exception as e:
|
| 154 |
log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 155 |
+
return []
|
| 156 |
+
|
| 157 |
+
# Per-document table overrides: outer keys are document names, the inner
# "tables" mapping is keyed by table number and names a processing method.
# NOTE(review): keys are presumably matched by exact string comparison, so the
# spelling (including the space in "№ 4.8") has to match the table data —
# confirm against the code that reads this mapping.
CUSTOM_TABLE_CONFIGS = {
    "НП-105-18": {"tables": {"№ 4.8": {"method": "group_entire_table"}}},
    "ГОСТ Р 50.05.23-2020": {"tables": {"№8": {"method": "group_entire_table"}}},
    "ГОСТ Р 50.03.01-2017": {"tables": {"А.8": {"method": "group_entire_table"}}},
}
|
| 174 |
+
|
| 175 |
+
def create_meta_info(document_name, section, table_number, table_title):
    """Return the four-line text header that describes a table.

    The lines appear in the order document, section, table number, table
    title; each line ends with a newline.
    """
    labelled_values = (
        ("Документ", document_name),
        ("Раздел", section),
        ("Таблица", table_number),
        ("Название таблицы", table_title),
    )
    return "".join(f"{label}: {value}\n" for label, value in labelled_values)
|
| 181 |
+
|
| 182 |
+
def _format_chunk_row(row):
    """Render one table row as a single '; '-separated line."""
    if isinstance(row, dict):
        # Iterating a dict yields only its keys, which would drop every cell
        # value; emit "column: value" pairs and skip empty cells instead.
        return "; ".join(
            f"{key}: {value}"
            for key, value in row.items()
            if value is not None and value != ""
        )
    if isinstance(row, (list, tuple)):
        return "; ".join(map(str, row))
    # Scalars (including plain strings) are kept whole rather than being
    # split into characters.
    return str(row)


def create_chunk_text(meta_info, headers, rows):
    """Assemble the text of a table chunk from its header block and rows.

    Args:
        meta_info: Text placed after the "Meta: " prefix.
        headers: Column headers; values are converted with ``str`` and joined
            with ", ". ``None`` is treated as no headers.
        rows: Table rows. Lists/tuples are joined cell by cell, dicts are
            rendered as "column: value" pairs, anything else via ``str``.
            ``None`` is treated as no rows.

    Returns:
        The chunk text: "Meta", "Headers" and "Rows" sections, one row per line.
    """
    header_line = ", ".join(map(str, headers or []))
    row_lines = [_format_chunk_row(row) for row in (rows or [])]
    return f"Meta: {meta_info}\nHeaders: {header_line}\nRows:\n" + "\n".join(row_lines)
|
| 188 |
+
|
| 189 |
+
def group_entire_table_method(table_data, document_name):
    """Build one Document that holds the entire table as a single chunk.

    Args:
        table_data: Table record (dict). Keys read here are 'headers',
            'data', 'section', 'table_number', 'table_title' and 'sheet_name'.
        document_name: Document identifier stored in the metadata and in the
            text header.

    Returns:
        A one-element list containing the Document.
    """
    # Null 'headers' / 'data' values are treated the same as missing ones.
    headers = table_data.get("headers") or []
    rows = table_data.get("data") or []
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")
    sheet_name = table_data.get("sheet_name")

    meta_info = create_meta_info(document_name, section, table_number, table_title)
    chunk_text = create_chunk_text(meta_info, headers, rows)
    metadata = {
        "type": "table",
        "table_number": table_number,
        "table_title": table_title,
        "document_id": document_name,
        "section": section,
        "section_id": section,
        "total_rows": len(rows),
        "processing_method": "group_entire_table",
        # Same key table_to_document sets; the loader's size statistics read
        # it and would otherwise count this table as 0.
        "content_size": len(chunk_text),
    }
    if sheet_name:
        metadata["sheet_name"] = sheet_name

    doc = Document(
        text=chunk_text,
        metadata=metadata
    )
    log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
    return [doc]
|