# NOTE(review): removed web-page scrape residue (site banner, "Sleeping" status,
# file size, commit hash, and a line-number gutter) that is not part of this
# Python source file.
import json
import os
import re
from pathlib import Path

from config import TABLE_DATA_DIR, MAX_CHARS_TABLE, MAX_ROWS_TABLE
def normalize_text(text):
    """Normalize document/table designations to a consistent Latin form.

    Replaces the Cyrillic letter Es (U+0421) used in designations like
    "С-25" or "С12" with Latin "C", dropping the hyphen in the "С-" form.
    Falsy input (None, "") is returned unchanged.

    Args:
        text: The string to normalize (or None).

    Returns:
        The normalized string, or the original falsy value.
    """
    if not text:
        return text
    # "С-25" (Cyrillic Es + hyphen) -> "C25" (Latin C, hyphen removed).
    text = text.replace('\u0421-', 'C')
    # BUGFIX: the original replacement re-emitted the same Cyrillic letter
    # (a no-op); consistent with the line above, map Cyrillic Es followed
    # by a digit to Latin C.  \u0421 is Cyrillic Es in the regex pattern.
    text = re.sub(r'\b\u0421(\d)', r'C\1', text)
    return text
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the preamble text that opens every table chunk.

    Emits the table identifier and source document, then optional title and
    section lines, a separator rule, an optional column-header line, and the
    "ДАННЫЕ:" marker that precedes the row data.

    Args:
        doc_id: Identifier of the source document.
        table_identifier: Display identifier of the table (normalized here).
        table_num: Raw table number; accepted for interface compatibility
            but not rendered (the identifier already carries it).
        table_title: Optional table title (normalized here).
        section: Optional section name.
        headers: Optional sequence of column headers.

    Returns:
        The assembled header string.
    """
    lines = [f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"]
    if table_title:
        lines.append(f"НАЗВАНИЕ: {normalize_text(table_title)}\n")
    if section:
        lines.append(f"РАЗДЕЛ: {section}\n")
    lines.append('=' * 70 + '\n')
    if headers:
        joined_headers = ' | '.join(str(h) for h in headers)
        lines.append(f"ЗАГОЛОВКИ: {joined_headers}\n\n")
    lines.append("ДАННЫЕ:\n")
    return ''.join(lines)
def format_single_row(row, idx):
    """Render one table row as a single numbered line.

    Dict rows render as "idx. key: value | key: value\n"; list rows as
    "idx. value | value\n". Cells that are None, blank, or the textual
    null markers 'nan'/'none' are dropped.

    Args:
        row: A dict or list of cell values; any other type yields "".
        idx: 1-based row number used as the line prefix.

    Returns:
        The formatted line, or "" when nothing meaningful remains.
    """
    def _keep(value):
        # A cell is meaningful unless it is None, blank, or a textual null
        # marker.  BUGFIX: the old truthiness test (`if v`) silently dropped
        # legitimate numeric 0 / 0.0 cells.
        if value is None:
            return False
        text = str(value).strip()
        return bool(text) and text.lower() not in ('nan', 'none')

    if isinstance(row, dict):
        # BUGFIX: skip '_'-prefixed bookkeeping keys (e.g. '_idx' added by
        # the chunker) so internal metadata no longer leaks into the output.
        parts = [f"{k}: {v}" for k, v in row.items()
                 if not str(k).startswith('_') and _keep(v)]
    elif isinstance(row, list):
        parts = [str(v) for v in row if _keep(v)]
    else:
        return ""
    return f"{idx}. {' | '.join(parts)}\n" if parts else ""
def format_table_rows(rows):
    """Render a sequence of indexed row dicts as concatenated lines.

    Each row is expected to carry its 1-based position under the '_idx'
    key (0 is used when the key is absent); rendering of the individual
    line is delegated to format_single_row.

    Args:
        rows: Iterable of row dicts.

    Returns:
        All formatted lines joined into one string.
    """
    return ''.join(
        format_single_row(row, row.get('_idx', 0)) for row in rows
    )
def format_table_footer(table_identifier, doc_id):
    """Return the closing delimiter that ends every table chunk.

    Args:
        table_identifier: Display identifier of the table.
        doc_id: Identifier of the source document.

    Returns:
        A separator rule followed by the "КОНЕЦ ТАБЛИЦЫ ..." line.
    """
    rule = '=' * 70
    return f"\n{rule}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
def _build_table_identifier(table_num_clean, section):
    """Derive a table's display identifier, appending the appendix
    ('Приложение') number or letter when the section indicates one."""
    if 'приложени' in section.lower():
        match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
        if match:
            return f"{table_num_clean} Приложение {match.group(1).upper()}"
    return table_num_clean


def _with_index(row, idx):
    """Return a dict copy of *row* carrying its 1-based position in '_idx'.

    List rows are wrapped as {'data': row} so every stored row is a dict
    (mirrors the wrapping the original chunking loop already performed).
    """
    row_copy = row.copy() if isinstance(row, dict) else {'data': row}
    row_copy['_idx'] = idx
    return row_copy


def _assemble_chunk(base_content, chunk_rows, total_rows, table_identifier, doc_id):
    """Glue header + rows + row-range note + footer into one chunk string."""
    content = base_content + format_table_rows(chunk_rows)
    content += f"\n\nСтроки {chunk_rows[0]['_idx']}-{chunk_rows[-1]['_idx']} из {total_rows}\n"
    content += format_table_footer(table_identifier, doc_id)
    return content


def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
    """Split one table into one or more self-contained text chunks.

    Args:
        table_data: Dict with optional keys 'headers', 'data', 'table_number',
            'table_title', 'section' (schema inferred from usage — confirm
            against the extractor that produces these JSON files).
        doc_id: Identifier of the source document.
        max_chars: Character budget per chunk.
        max_rows: Row budget per chunk.

    Returns:
        A list of chunk strings: one chunk when the whole table fits within
        both budgets, otherwise several chunks that each repeat the header
        and note the row range covered. Empty 'data' returns [].
    """
    headers = table_data.get('headers', [])
    rows = table_data.get('data', [])
    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    # `or ''` guards against an explicit null in the JSON, which
    # .get('section', '') alone would pass through as None and crash .lower().
    section = table_data.get('section') or ''

    table_identifier = _build_table_identifier(str(table_num).strip(), section)
    if not rows:
        return []

    base_content = format_table_header(doc_id, table_identifier, table_num,
                                       normalize_text(str(table_title)), section, headers)
    base_size = len(base_content)
    # 200 chars of headroom for each chunk's row-range note and footer.
    available_space = max_chars - base_size - 200

    # Index every row up front.  BUGFIX: the old single-chunk path built
    # {**row, '_idx': ...}, which raises TypeError when a row is a list;
    # _with_index handles both shapes, matching the chunked path.
    indexed_rows = [_with_index(row, i + 1) for i, row in enumerate(rows)]

    # Fast path: the entire table fits in one chunk.
    full_rows_content = format_table_rows(indexed_rows)
    if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
        return [base_content + full_rows_content
                + format_table_footer(table_identifier, doc_id)]

    chunks = []
    current_rows = []
    current_size = 0
    for row_copy in indexed_rows:
        row_size = len(format_single_row(row_copy, row_copy['_idx']))
        # Flush the current chunk before this row would overflow either budget.
        if current_rows and (current_size + row_size > available_space
                             or len(current_rows) >= max_rows):
            chunks.append(_assemble_chunk(base_content, current_rows, len(rows),
                                          table_identifier, doc_id))
            current_rows = []
            current_size = 0
        current_rows.append(row_copy)
        current_size += row_size

    # Emit the trailing partial chunk, if any rows remain.
    if current_rows:
        chunks.append(_assemble_chunk(base_content, current_rows, len(rows),
                                      table_identifier, doc_id))
    return chunks
def _write_chunks_file(output_path, chunks):
    """Write all chunks for one source JSON into a single text file,
    separating each chunk with a '#' banner and a "CHUNK i of N" line."""
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, chunk in enumerate(chunks):
            f.write(f"\n{'#'*70}\n")
            f.write(f"CHUNK {i+1} of {len(chunks)}\n")
            f.write(f"{'#'*70}\n\n")
            f.write(chunk)
            f.write("\n\n")


def export_table_chunks():
    """Export every table JSON in TABLE_DATA_DIR as chunked .txt files.

    Reads each *.json file, chunks every sheet via chunk_table_by_content,
    and writes one .txt per JSON (same stem) under ./table_chunks_export.
    Per-file failures are reported and skipped (best-effort batch run);
    a summary is printed at the end.
    """
    output_dir = Path("table_chunks_export")
    output_dir.mkdir(exist_ok=True)

    table_dir = Path(TABLE_DATA_DIR)
    if not table_dir.exists():
        print(f"❌ Directory not found: {TABLE_DATA_DIR}")
        return

    json_files = list(table_dir.glob("*.json"))
    print(f"📁 Found {len(json_files)} JSON files in {TABLE_DATA_DIR}")

    total_chunks = 0
    total_files = 0
    for json_file in json_files:
        try:
            print(f"\n📄 Processing: {json_file.name}")
            data = json.loads(json_file.read_text(encoding='utf-8'))
            # Fall back from 'document_id' to 'document' (both observed keys).
            doc_id = data.get('document_id', data.get('document', 'unknown'))

            # Collect the chunks of every sheet in this JSON file.
            all_chunks = []
            for sheet in data.get('sheets', []):  # index was unused; iterate directly
                sheet_doc_id = sheet.get('document_id', sheet.get('document', doc_id))
                all_chunks.extend(chunk_table_by_content(sheet, sheet_doc_id))

            if all_chunks:
                # The output .txt reuses the JSON filename stem.
                output_filename = json_file.stem + ".txt"
                _write_chunks_file(output_dir / output_filename, all_chunks)
                print(f" ✓ Saved: {output_filename} with {len(all_chunks)} chunks")
                total_chunks += len(all_chunks)
                total_files += 1
            else:
                print(f" ⚠️ No chunks found in {json_file.name}")
        except Exception as e:
            # Best-effort batch: report and continue with the next file.
            print(f" ❌ Error processing {json_file.name}: {e}")

    print(f"\n{'='*60}")
    print(f"✅ Export complete!")
    print(f" Total files created: {total_files}")
    print(f" Total chunks exported: {total_chunks}")
    print(f" Output directory: {output_dir.absolute()}")
    print(f"{'='*60}")
# Run the exporter when executed as a script (not on import).
# BUGFIX: removed a stray trailing '|' artifact that made this line a syntax error.
if __name__ == "__main__":
    export_table_chunks()