File size: 7,365 Bytes
07e9959
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import json
import os
from pathlib import Path
from config import TABLE_DATA_DIR, MAX_CHARS_TABLE, MAX_ROWS_TABLE

def normalize_text(text):
    """Normalize a text fragment used in table identifiers/titles.

    Falsy input (None, '') is returned unchanged.
    """
    if not text:
        return text
    # Replaces the two-character sequence 'С-' with a single 'C'.
    # NOTE(review): the source letter appears to be Cyrillic 'С' while the
    # replacement is Latin 'C', and the hyphen is dropped — confirm this
    # mixed-alphabet substitution is intentional.
    text = text.replace('С-', 'C')
    import re
    # NOTE(review): the letter in the pattern and the letter in the
    # replacement look identical, which would make this substitution a
    # no-op. If the intent was to convert Latin 'C' before a digit into
    # Cyrillic 'С' (or the reverse), one side should use the other
    # alphabet — verify against the data this runs on.
    text = re.sub(r'\bС(\d)', r'С\1', text)
    return text

def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the header text that precedes a table's data rows.

    The identifier and title are passed through normalize_text; *table_num*
    is accepted for interface compatibility but not rendered. Optional
    parts (title, section, headers line) are included only when truthy.
    """
    lines = [f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"]

    if table_title:
        lines.append(f"НАЗВАНИЕ: {normalize_text(table_title)}\n")

    if section:
        lines.append(f"РАЗДЕЛ: {section}\n")

    # Visual separator between the metadata block and the column headers.
    lines.append(f"{'='*70}\n")

    if headers:
        joined = ' | '.join(str(col) for col in headers)
        lines.append(f"ЗАГОЛОВКИ: {joined}\n\n")

    lines.append("ДАННЫЕ:\n")
    return ''.join(lines)

def format_single_row(row, idx):
    """Format one table row as a numbered '" | "-separated' line.

    Args:
        row: either a dict (rendered as "key: value" pairs) or a list
            (values joined directly). Any other type yields "".
        idx: 1-based display number prefixed to the line.

    Returns:
        "idx. part | part | ...\n", or "" when nothing survives filtering.

    Empty values and the string placeholders 'nan'/'none' are dropped.
    Bug fix: the internal '_idx' bookkeeping key that the chunking code
    attaches to dict rows is no longer rendered into the output.
    """
    if isinstance(row, dict):
        parts = [f"{k}: {v}" for k, v in row.items()
                 if k != '_idx'  # internal row-number marker, not table data
                 and v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    elif isinstance(row, list):
        parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    return ""

def format_table_rows(rows):
    """Concatenate the formatted text of every row in *rows*.

    Each row's display number is read from its '_idx' entry (0 when
    absent); rows are therefore expected to be dicts prepared by the
    chunking code.
    """
    return ''.join(format_single_row(r, r.get('_idx', 0)) for r in rows)

def format_table_footer(table_identifier, doc_id):
    """Return the separator line and closing caption for a table chunk."""
    separator = '=' * 70
    return f"\n{separator}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"

def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
    """Split one table into text chunks of at most *max_chars* / *max_rows*.

    Args:
        table_data: dict with optional keys 'headers', 'data' (list of dict
            or list rows), 'table_number', 'table_title', 'section'.
        doc_id: document identifier rendered into header and footer.
        max_chars: soft size limit per chunk (header + rows + footer).
        max_rows: maximum number of source rows per chunk.

    Returns:
        A list of ready-to-write chunk strings; [] when the table has no rows.

    Bug fixes vs. the previous version:
      * list rows crashed the single-chunk path ({**row} on a list raises
        TypeError) and were rendered via their str() repr in the
        multi-chunk path — rows are now pre-formatted once with
        format_single_row, which handles both shapes;
      * the '_idx' bookkeeping key no longer leaks into the output;
      * a None 'section' no longer crashes section.lower().
    """
    import re

    headers = table_data.get('headers', [])
    rows = table_data.get('data', [])
    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    section = table_data.get('section', '')

    table_num_clean = str(table_num).strip()
    table_title_normalized = normalize_text(str(table_title))

    # Tables located in an appendix get the appendix number/letter appended
    # to the identifier so chunks from different appendices stay distinct.
    table_identifier = table_num_clean
    section_lower = str(section or '').lower()
    if 'приложени' in section_lower:
        appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section_lower)
        if appendix_match:
            table_identifier = f"{table_num_clean} Приложение {appendix_match.group(1).upper()}"

    if not rows:
        return []

    base_content = format_table_header(doc_id, table_identifier, table_num,
                                       table_title_normalized, section, headers)
    base_size = len(base_content)
    # Reserve ~200 chars for the row-range caption and the footer.
    available_space = max_chars - base_size - 200

    # Pre-format every row exactly once with 1-based numbering.
    row_texts = [format_single_row(row, i + 1) for i, row in enumerate(rows)]

    # Fast path: the entire table fits in a single chunk (no range caption).
    full_rows_content = ''.join(row_texts)
    if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
        return [base_content + full_rows_content + format_table_footer(table_identifier, doc_id)]

    def _emit(chunk_texts, first_idx, last_idx):
        """Assemble one chunk: header, rows, row-range caption, footer."""
        body = base_content + ''.join(chunk_texts)
        body += f"\n\nСтроки {first_idx}-{last_idx} из {len(rows)}\n"
        body += format_table_footer(table_identifier, doc_id)
        return body

    chunks = []
    current_texts = []
    current_size = 0
    first_idx = 1  # 1-based index of the first row in the current chunk

    for i, row_text in enumerate(row_texts):
        row_size = len(row_text)
        # Flush the current chunk before it would exceed either limit.
        if current_texts and (current_size + row_size > available_space
                              or len(current_texts) >= max_rows):
            chunks.append(_emit(current_texts, first_idx, i))
            current_texts = []
            current_size = 0
            first_idx = i + 1
        current_texts.append(row_text)
        current_size += row_size

    if current_texts:
        chunks.append(_emit(current_texts, first_idx, len(rows)))

    return chunks

def export_table_chunks():
    """Export every table found under TABLE_DATA_DIR as chunked text files.

    Reads each *.json file, chunks every sheet with chunk_table_by_content,
    and writes all chunks belonging to one JSON file into a single .txt
    file under table_chunks_export/. Progress and a final summary are
    printed to stdout; per-file errors are reported and skipped.
    """
    out_dir = Path("table_chunks_export")
    out_dir.mkdir(exist_ok=True)

    src_dir = Path(TABLE_DATA_DIR)
    if not src_dir.exists():
        print(f"❌ Directory not found: {TABLE_DATA_DIR}")
        return

    json_files = list(src_dir.glob("*.json"))
    print(f"📁 Found {len(json_files)} JSON files in {TABLE_DATA_DIR}")

    total_chunks = 0
    total_files = 0

    for json_file in json_files:
        try:
            print(f"\n📄 Processing: {json_file.name}")

            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Document id falls back from 'document_id' to 'document'.
            doc_id = data.get('document_id', data.get('document', 'unknown'))

            # One flat list of chunks across all sheets of this JSON file.
            all_chunks = []
            for sheet in data.get('sheets', []):
                sheet_doc_id = sheet.get('document_id', sheet.get('document', doc_id))
                all_chunks.extend(chunk_table_by_content(sheet, sheet_doc_id))

            if not all_chunks:
                print(f"  ⚠️ No chunks found in {json_file.name}")
                continue

            # All chunks of one JSON land in a single .txt named after it.
            output_filename = json_file.stem + ".txt"
            with open(out_dir / output_filename, 'w', encoding='utf-8') as f:
                for i, chunk in enumerate(all_chunks):
                    f.write(f"\n{'#'*70}\n")
                    f.write(f"CHUNK {i+1} of {len(all_chunks)}\n")
                    f.write(f"{'#'*70}\n\n")
                    f.write(chunk)
                    f.write("\n\n")

            print(f"  ✓ Saved: {output_filename} with {len(all_chunks)} chunks")
            total_chunks += len(all_chunks)
            total_files += 1

        except Exception as e:
            print(f"  ❌ Error processing {json_file.name}: {e}")

    print(f"\n{'='*60}")
    print(f"✅ Export complete!")
    print(f"   Total files created: {total_files}")
    print(f"   Total chunks exported: {total_chunks}")
    print(f"   Output directory: {out_dir.absolute()}")
    print(f"{'='*60}")

# Script entry point: run the full export when executed directly.
if __name__ == "__main__":
    export_table_chunks()