MrSimple01 committed on
Commit
051e774
·
verified ·
1 Parent(s): 0bc2e08

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +496 -573
documents_prep.py CHANGED
@@ -1,574 +1,497 @@
1
- import json
2
- import zipfile
3
- import pandas as pd
4
- from huggingface_hub import hf_hub_download, list_repo_files
5
- from llama_index.core import Document
6
- from llama_index.core.text_splitter import SentenceSplitter
7
- from my_logging import log_message
8
- from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
-
10
- def chunk_text_documents(documents):
11
- text_splitter = SentenceSplitter(
12
- chunk_size=CHUNK_SIZE,
13
- chunk_overlap=CHUNK_OVERLAP
14
- )
15
-
16
- chunked = []
17
- for doc in documents:
18
- chunks = text_splitter.get_nodes_from_documents([doc])
19
- for i, chunk in enumerate(chunks):
20
- chunk.metadata.update({
21
- 'chunk_id': i,
22
- 'total_chunks': len(chunks),
23
- 'chunk_size': len(chunk.text) # Add chunk size
24
- })
25
- chunked.append(chunk)
26
-
27
- # Log statistics
28
- if chunked:
29
- avg_size = sum(len(c.text) for c in chunked) / len(chunked)
30
- min_size = min(len(c.text) for c in chunked)
31
- max_size = max(len(c.text) for c in chunked)
32
- log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
33
- log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
34
-
35
- return chunked
36
-
37
def normalize_connection_type(s):
    """Normalize a connection-type code for consistent matching.

    Maps look-alike Cyrillic letters (С, У, Т and lowercase) to their Latin
    counterparts and strips all hyphens, so 'С-25-1' and 'C25-1' both
    normalize to 'C251'.
    """
    # str.translate performs all six letter substitutions in one pass.
    # The original also had 'С-' -> 'C-' style replacements, but those were
    # dead code: the Cyrillic letters are already gone by that point.
    s = s.translate(str.maketrans('СсУуТт', 'CcUuTt'))
    # Remove all hyphens for consistent tokenization
    return s.replace('-', '')
48
-
49
def extract_connection_type(text):
    """Return the first connection-type code found in *text*, normalized.

    Matches codes like 'C-25', 'С25' or 'C-25-1' (Latin or Cyrillic leading
    letter, optional hyphens); returns '' when no code is present.
    """
    import re
    match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
    return normalize_connection_type(match.group(0)) if match else ''
57
-
58
- def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
59
- headers = table_data.get('headers', [])
60
- rows = table_data.get('data', [])
61
- table_num = table_data.get('table_number', 'unknown')
62
- table_title = table_data.get('table_title', '')
63
- section = table_data.get('section', '')
64
- table_description = table_data.get('table_description', '')
65
-
66
- table_num_clean = str(table_num).strip()
67
-
68
- import re
69
- if 'приложени' in section.lower():
70
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
71
- if appendix_match:
72
- appendix_num = appendix_match.group(1).upper()
73
- table_identifier = f"{table_num_clean} Приложение {appendix_num}"
74
- else:
75
- table_identifier = table_num_clean
76
- else:
77
- table_identifier = table_num_clean
78
-
79
- if not rows:
80
- return []
81
-
82
- log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
83
-
84
- # Calculate base metadata size - NOW INCLUDING DESCRIPTION
85
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
86
-
87
- # ADD DESCRIPTION HERE if it exists
88
- if table_description:
89
- base_content += f"ОПИСАНИЕ: {table_description}\n\n"
90
-
91
- base_size = len(base_content)
92
- available_space = max_chars - base_size - 200
93
-
94
- # If entire table fits, return as one chunk
95
- full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
96
- if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
97
- content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
98
-
99
- metadata = {
100
- 'type': 'table',
101
- 'document_id': doc_id,
102
- 'table_number': table_num_clean,
103
- 'table_identifier': table_identifier,
104
- 'table_title': table_title,
105
- 'section': section,
106
- 'total_rows': len(rows),
107
- 'chunk_size': len(content),
108
- 'is_complete_table': True,
109
- 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
110
-
111
- }
112
-
113
- log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
114
- return [Document(text=content, metadata=metadata)]
115
-
116
- chunks = []
117
- current_rows = []
118
- current_size = 0
119
- chunk_num = 0
120
-
121
- for i, row in enumerate(rows):
122
- row_text = format_single_row(row, i + 1)
123
- row_size = len(row_text)
124
-
125
- should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
126
-
127
- if should_split:
128
- content = base_content + format_table_rows(current_rows)
129
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
130
- content += format_table_footer(table_identifier, doc_id)
131
-
132
- metadata = {
133
- 'type': 'table',
134
- 'document_id': doc_id,
135
- 'table_number': table_num_clean,
136
- 'table_identifier': table_identifier,
137
- 'table_title': table_title,
138
- 'section': section,
139
- 'chunk_id': chunk_num,
140
- 'row_start': current_rows[0]['_idx'] - 1,
141
- 'row_end': current_rows[-1]['_idx'],
142
- 'total_rows': len(rows),
143
- 'chunk_size': len(content),
144
- 'is_complete_table': False,
145
- 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
146
- }
147
-
148
- chunks.append(Document(text=content, metadata=metadata))
149
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
150
-
151
- chunk_num += 1
152
- current_rows = []
153
- current_size = 0
154
-
155
- # Add row with index
156
- row_copy = row.copy() if isinstance(row, dict) else {'data': row}
157
- row_copy['_idx'] = i + 1
158
- current_rows.append(row_copy)
159
- current_size += row_size
160
-
161
- # Add final chunk
162
- if current_rows:
163
- content = base_content + format_table_rows(current_rows)
164
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
165
- content += format_table_footer(table_identifier, doc_id)
166
-
167
- metadata = {
168
- 'type': 'table',
169
- 'document_id': doc_id,
170
- 'table_number': table_num_clean,
171
- 'table_identifier': table_identifier,
172
- 'table_title': table_title,
173
- 'section': section,
174
- 'chunk_id': chunk_num,
175
- 'row_start': current_rows[0]['_idx'] - 1,
176
- 'row_end': current_rows[-1]['_idx'],
177
- 'total_rows': len(rows),
178
- 'chunk_size': len(content),
179
- 'is_complete_table': False
180
- }
181
-
182
- chunks.append(Document(text=content, metadata=metadata))
183
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
184
-
185
- return chunks
186
-
187
- def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
188
- content = f"ДОКУМЕНТ: {doc_id}\n"
189
- content += f"ТАБЛИЦА: {table_identifier}\n"
190
-
191
- if table_title:
192
- # Normalize the title text itself for better searchability
193
- normalized_title = normalize_connection_type(table_title)
194
- content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
195
-
196
- # Extract and store the normalized connection type
197
- connection_type = extract_connection_type(table_title)
198
- if connection_type:
199
- content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
200
-
201
- if table_num and table_num != table_identifier:
202
- content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
203
-
204
- if section:
205
- content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
206
-
207
- content += f"\n{'='*70}\n"
208
-
209
- if headers:
210
- content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
211
- for i, h in enumerate(headers, 1):
212
- # NORMALIZE HEADERS TOO
213
- normalized_header = normalize_connection_type(h)
214
- content += f" {i}. {normalized_header}\n"
215
- content += "\n"
216
-
217
- content += "ДАННЫЕ ТАБЛИЦЫ:\n"
218
- return content
219
-
220
-
221
- def format_single_row(row, idx):
222
- """Format a single row with normalization"""
223
- if isinstance(row, dict):
224
- # NORMALIZE VALUES IN ROWS
225
- parts = []
226
- for k, v in row.items():
227
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
228
- normalized_v = normalize_connection_type(str(v))
229
- parts.append(f"{k}: {normalized_v}")
230
- if parts:
231
- return f"{idx}. {' | '.join(parts)}\n"
232
- elif isinstance(row, list):
233
- # NORMALIZE LIST VALUES
234
- parts = []
235
- for v in row:
236
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
237
- normalized_v = normalize_connection_type(str(v))
238
- parts.append(normalized_v)
239
- if parts:
240
- return f"{idx}. {' | '.join(parts)}\n"
241
- return ""
242
-
243
- def format_table_rows(rows):
244
- """Format multiple rows"""
245
- content = ""
246
- for row in rows:
247
- idx = row.get('_idx', 0)
248
- content += format_single_row(row, idx)
249
- return content
250
-
251
-
252
- def format_table_footer(table_identifier, doc_id):
253
- """Format table footer"""
254
- return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
255
-
256
- def load_json_documents(repo_id, hf_token, json_dir):
257
- import zipfile
258
- import tempfile
259
- import os
260
-
261
- log_message("Loading JSON documents...")
262
-
263
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
264
- json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
265
- zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
266
-
267
- log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
268
-
269
- documents = []
270
- stats = {'success': 0, 'failed': 0, 'empty': 0}
271
-
272
- for file_path in json_files:
273
- try:
274
- log_message(f" Loading: {file_path}")
275
- local_path = hf_hub_download(
276
- repo_id=repo_id,
277
- filename=file_path,
278
- repo_type="dataset",
279
- token=hf_token
280
- )
281
-
282
- docs = extract_sections_from_json(local_path)
283
- if docs:
284
- documents.extend(docs)
285
- stats['success'] += 1
286
- log_message(f" ✓ Extracted {len(docs)} sections")
287
- else:
288
- stats['empty'] += 1
289
- log_message(f" ⚠ No sections found")
290
-
291
- except Exception as e:
292
- stats['failed'] += 1
293
- log_message(f" ✗ Error: {e}")
294
-
295
- for zip_path in zip_files:
296
- try:
297
- log_message(f" Processing ZIP: {zip_path}")
298
- local_zip = hf_hub_download(
299
- repo_id=repo_id,
300
- filename=zip_path,
301
- repo_type="dataset",
302
- token=hf_token
303
- )
304
-
305
- with zipfile.ZipFile(local_zip, 'r') as zf:
306
- json_files_in_zip = [f for f in zf.namelist()
307
- if f.endswith('.json')
308
- and not f.startswith('__MACOSX')
309
- and not f.startswith('.')
310
- and not '._' in f]
311
-
312
- log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")
313
-
314
- for json_file in json_files_in_zip:
315
- try:
316
- file_content = zf.read(json_file)
317
-
318
- # Skip if file is too small
319
- if len(file_content) < 10:
320
- log_message(f" ✗ Skipping: {json_file} (file too small)")
321
- stats['failed'] += 1
322
- continue
323
-
324
- # Try UTF-8 first (most common)
325
- try:
326
- text_content = file_content.decode('utf-8')
327
- except UnicodeDecodeError:
328
- try:
329
- text_content = file_content.decode('utf-8-sig')
330
- except UnicodeDecodeError:
331
- try:
332
- # Try UTF-16 (the issue you're seeing)
333
- text_content = file_content.decode('utf-16')
334
- except UnicodeDecodeError:
335
- try:
336
- text_content = file_content.decode('windows-1251')
337
- except UnicodeDecodeError:
338
- log_message(f" Skipping: {json_file} (encoding failed)")
339
- stats['failed'] += 1
340
- continue
341
-
342
- # Validate JSON structure
343
- if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
344
- log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
345
- stats['failed'] += 1
346
- continue
347
-
348
- with tempfile.NamedTemporaryFile(mode='w', delete=False,
349
- suffix='.json', encoding='utf-8') as tmp:
350
- tmp.write(text_content)
351
- tmp_path = tmp.name
352
-
353
- docs = extract_sections_from_json(tmp_path)
354
- if docs:
355
- documents.extend(docs)
356
- stats['success'] += 1
357
- log_message(f" ✓ {json_file}: {len(docs)} sections")
358
- else:
359
- stats['empty'] += 1
360
- log_message(f" ⚠ {json_file}: No sections")
361
-
362
- os.unlink(tmp_path)
363
-
364
- except json.JSONDecodeError as e:
365
- stats['failed'] += 1
366
- log_message(f" ✗ {json_file}: Invalid JSON")
367
- except Exception as e:
368
- stats['failed'] += 1
369
- log_message(f" ✗ {json_file}: {str(e)[:100]}")
370
-
371
- except Exception as e:
372
- log_message(f" ✗ Error with ZIP: {e}")
373
-
374
- log_message(f"="*60)
375
- log_message(f"JSON Loading Stats:")
376
- log_message(f" Success: {stats['success']}")
377
- log_message(f" Empty: {stats['empty']}")
378
- log_message(f" Failed: {stats['failed']}")
379
- log_message(f" Total sections: {len(documents)}")
380
- log_message(f"="*60)
381
-
382
- return documents
383
-
384
- def extract_sections_from_json(json_path):
385
- """Extract sections from a single JSON file"""
386
- documents = []
387
-
388
- try:
389
- with open(json_path, 'r', encoding='utf-8') as f:
390
- data = json.load(f)
391
-
392
- doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
393
-
394
- # Extract all section levels
395
- for section in data.get('sections', []):
396
- if section.get('section_text', '').strip():
397
- documents.append(Document(
398
- text=section['section_text'],
399
- metadata={
400
- 'type': 'text',
401
- 'document_id': doc_id,
402
- 'section_id': section.get('section_id', '')
403
- }
404
- ))
405
-
406
- # Subsections
407
- for subsection in section.get('subsections', []):
408
- if subsection.get('subsection_text', '').strip():
409
- documents.append(Document(
410
- text=subsection['subsection_text'],
411
- metadata={
412
- 'type': 'text',
413
- 'document_id': doc_id,
414
- 'section_id': subsection.get('subsection_id', '')
415
- }
416
- ))
417
-
418
- # Sub-subsections
419
- for sub_sub in subsection.get('sub_subsections', []):
420
- if sub_sub.get('sub_subsection_text', '').strip():
421
- documents.append(Document(
422
- text=sub_sub['sub_subsection_text'],
423
- metadata={
424
- 'type': 'text',
425
- 'document_id': doc_id,
426
- 'section_id': sub_sub.get('sub_subsection_id', '')
427
- }
428
- ))
429
-
430
- except Exception as e:
431
- log_message(f"Error extracting from {json_path}: {e}")
432
-
433
- return documents
434
-
435
-
436
- def load_table_documents(repo_id, hf_token, table_dir):
437
- log_message("Loading tables...")
438
-
439
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
440
- table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
441
-
442
- all_chunks = []
443
- connection_type_sources = {} # Track which table each type comes from
444
-
445
- for file_path in table_files:
446
- try:
447
- local_path = hf_hub_download(
448
- repo_id=repo_id,
449
- filename=file_path,
450
- repo_type="dataset",
451
- token=hf_token
452
- )
453
-
454
- with open(local_path, 'r', encoding='utf-8') as f:
455
- data = json.load(f)
456
-
457
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
458
-
459
- for sheet in data.get('sheets', []):
460
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
461
- table_num = sheet.get('table_number', 'unknown')
462
- table_title = sheet.get('table_title', '')
463
-
464
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
465
- all_chunks.extend(chunks)
466
-
467
- # Track connection type source
468
- conn_type = extract_connection_type(table_title)
469
- if conn_type:
470
- if conn_type not in connection_type_sources:
471
- connection_type_sources[conn_type] = []
472
- connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
473
-
474
- except Exception as e:
475
- log_message(f"Error loading {file_path}: {e}")
476
-
477
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
478
-
479
- log_message("="*60)
480
- log_message("CONNECTION TYPES AND THEIR SOURCES:")
481
- for conn_type in sorted(connection_type_sources.keys()):
482
- sources = connection_type_sources[conn_type]
483
- log_message(f" {conn_type}: {len(sources)} tables")
484
- for src in sources:
485
- log_message(f" - {src}")
486
- log_message("="*60)
487
-
488
- return all_chunks
489
-
490
- def load_image_documents(repo_id, hf_token, image_dir):
491
- """Load image descriptions"""
492
- log_message("Loading images...")
493
-
494
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
495
- csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
496
-
497
- documents = []
498
- for file_path in csv_files:
499
- try:
500
- local_path = hf_hub_download(
501
- repo_id=repo_id,
502
- filename=file_path,
503
- repo_type="dataset",
504
- token=hf_token
505
- )
506
-
507
- df = pd.read_csv(local_path)
508
-
509
- for _, row in df.iterrows():
510
- content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
511
- content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
512
- content += f"Название: {row.get('Название изображения', '')}\n"
513
- content += f"Описание: {row.get('Описание изображение', '')}\n"
514
- content += f"Раздел: {row.get('Раздел документа', '')}\n"
515
-
516
- chunk_size = len(content)
517
-
518
- documents.append(Document(
519
- text=content,
520
- metadata={
521
- 'type': 'image',
522
- 'document_id': str(row.get('Обозначение документа', 'unknown')),
523
- 'image_number': str(row.get('№ Изображения', 'unknown')),
524
- 'section': str(row.get('Раздел документа', '')),
525
- 'chunk_size': chunk_size
526
- }
527
- ))
528
- except Exception as e:
529
- log_message(f"Error loading {file_path}: {e}")
530
-
531
- if documents:
532
- avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
533
- log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
534
-
535
- return documents
536
-
537
- def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
538
- log_message("="*60)
539
- log_message("STARTING DOCUMENT LOADING")
540
- log_message("="*60)
541
-
542
- # Load text sections
543
- text_docs = load_json_documents(repo_id, hf_token, json_dir)
544
- text_chunks = chunk_text_documents(text_docs)
545
-
546
- # Load tables (already chunked)
547
- table_chunks = load_table_documents(repo_id, hf_token, table_dir)
548
-
549
- # NEW: Analyze connection types in tables
550
- connection_types = {}
551
- for chunk in table_chunks:
552
- conn_type = chunk.metadata.get('connection_type', '')
553
- if conn_type:
554
- connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
555
-
556
- log_message("="*60)
557
- log_message("CONNECTION TYPES FOUND IN TABLES:")
558
- for conn_type, count in sorted(connection_types.items()):
559
- log_message(f" {conn_type}: {count} chunks")
560
- log_message("="*60)
561
-
562
- # Load images (no chunking needed)
563
- image_docs = load_image_documents(repo_id, hf_token, image_dir)
564
-
565
- all_docs = text_chunks + table_chunks + image_docs
566
-
567
- log_message("="*60)
568
- log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
569
- log_message(f" Text chunks: {len(text_chunks)}")
570
- log_message(f" Table chunks: {len(table_chunks)}")
571
- log_message(f" Images: {len(image_docs)}")
572
- log_message("="*60)
573
-
574
  return all_docs
 
1
+ import json
2
+ import zipfile
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download, list_repo_files
5
+ from llama_index.core import Document
6
+ from llama_index.core.text_splitter import SentenceSplitter
7
+ from my_logging import log_message
8
+ from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
+
10
def chunk_text_documents(documents):
    """Split text documents into overlapping sentence-based chunks.

    Every resulting node gets chunk_id / total_chunks / chunk_size metadata;
    aggregate size statistics are logged once all documents are processed.
    """
    splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    chunked = []
    for document in documents:
        nodes = splitter.get_nodes_from_documents([document])
        total = len(nodes)
        for index, node in enumerate(nodes):
            node.metadata.update({
                'chunk_id': index,
                'total_chunks': total,
                'chunk_size': len(node.text)  # recorded for the stats below
            })
            chunked.append(node)

    # Log statistics
    if chunked:
        sizes = [len(node.text) for node in chunked]
        avg_size = sum(sizes) / len(sizes)
        min_size = min(sizes)
        max_size = max(sizes)
        log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
        log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")

    return chunked
36
+
37
+
38
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
    """Split one table (a 'sheet' dict) into one or more Document chunks.

    Returns a single chunk when the whole table fits within *max_chars* and
    *max_rows*; otherwise several partial chunks, each repeating the table
    header so every chunk is self-describing.  Returns [] for empty tables.
    """
    import re

    headers = table_data.get('headers', [])
    rows = table_data.get('data', [])
    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    section = table_data.get('section', '')

    table_num_clean = str(table_num).strip()

    # Tables living inside an appendix get the appendix number/letter folded
    # into their identifier so identifiers stay unique across the document.
    table_identifier = table_num_clean
    if 'приложени' in section.lower():
        appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
        if appendix_match:
            appendix_num = appendix_match.group(1).upper()
            table_identifier = f"{table_num_clean} Приложение {appendix_num}"

    if not rows:
        return []

    log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")

    # Calculate base metadata size; reserve headroom for footer/row-range text.
    base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
    base_size = len(base_content)
    available_space = max_chars - base_size - 200

    def _indexed(row, idx):
        # Normalize a row to a dict carrying its 1-based source index.
        row_copy = row.copy() if isinstance(row, dict) else {'data': row}
        row_copy['_idx'] = idx
        return row_copy

    # BUGFIX: the original built {**row, '_idx': i+1} here, which raised
    # TypeError for list rows even though the splitting loop below supports
    # them; _indexed() handles both shapes uniformly.
    all_indexed = [_indexed(row, i + 1) for i, row in enumerate(rows)]

    def _metadata(extra):
        # Metadata fields shared by complete and partial chunks.
        meta = {
            'type': 'table',
            'document_id': doc_id,
            'table_number': table_num_clean,
            'table_identifier': table_identifier,
            'table_title': table_title,
            'section': section,
            'total_rows': len(rows),
        }
        meta.update(extra)
        return meta

    # If entire table fits, return as one chunk
    full_rows_content = format_table_rows(all_indexed)
    if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
        content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
        metadata = _metadata({'chunk_size': len(content), 'is_complete_table': True})
        log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
        return [Document(text=content, metadata=metadata)]

    chunks = []

    def _emit(chunk_rows, chunk_num):
        # Emit one partial chunk covering chunk_rows (already indexed).
        content = base_content + format_table_rows(chunk_rows)
        content += f"\n\nСтроки {chunk_rows[0]['_idx']}-{chunk_rows[-1]['_idx']} из {len(rows)}\n"
        content += format_table_footer(table_identifier, doc_id)
        metadata = _metadata({
            'chunk_id': chunk_num,
            'row_start': chunk_rows[0]['_idx'] - 1,
            'row_end': chunk_rows[-1]['_idx'],
            'chunk_size': len(content),
            'is_complete_table': False,
        })
        chunks.append(Document(text=content, metadata=metadata))
        log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(chunk_rows)} rows")

    current_rows = []
    current_size = 0
    chunk_num = 0

    for i, row in enumerate(rows):
        # Size estimate uses the raw row, as in the original accounting.
        row_size = len(format_single_row(row, i + 1))

        should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
        if should_split:
            _emit(current_rows, chunk_num)
            chunk_num += 1
            current_rows = []
            current_size = 0

        current_rows.append(all_indexed[i])
        current_size += row_size

    # Add final chunk
    if current_rows:
        _emit(current_rows, chunk_num)

    return chunks
157
+
158
+
159
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the descriptive header text that precedes every table chunk."""
    lines = [f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"]

    # The table type/number goes in early and prominently for matching.
    if table_num:
        lines.append(f"ТИП: {table_num}\n")

    if table_title:
        lines.append(f"НАЗВАНИЕ: {table_title}\n")

    if section:
        lines.append(f"РАЗДЕЛ: {section}\n")

    lines.append('=' * 70 + "\n")

    if headers:
        joined = ' | '.join(str(h) for h in headers)
        lines.append(f"ЗАГОЛОВКИ: {joined}\n\n")

    lines.append("ДАННЫЕ:\n")
    return ''.join(lines)
180
+
181
+
182
def format_single_row(row, idx):
    """Format one table row as a numbered ' | '-separated line.

    Accepts dict or list rows; any other type (or a row with no meaningful
    cells) yields ''.  Cells holding None, blanks, or 'nan'/'none'
    placeholders are dropped.
    """
    def _keep(value):
        # BUGFIX: the original tested `if v and ...`, which silently dropped
        # legitimate numeric zeros (0, 0.0) from the rendered table data.
        if value is None or value is False:
            return False
        text = str(value).strip()
        return bool(text) and text.lower() not in ('nan', 'none')

    if isinstance(row, dict):
        # Skip internal bookkeeping keys (e.g. '_idx') so they never leak
        # into the chunk text.
        parts = [f"{k}: {v}" for k, v in row.items()
                 if not str(k).startswith('_') and _keep(v)]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    elif isinstance(row, list):
        parts = [str(v) for v in row if _keep(v)]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    return ""
194
+
195
+
196
def format_table_rows(rows):
    """Render a list of indexed rows (each carrying an '_idx' key) as text."""
    return ''.join(format_single_row(row, row.get('_idx', 0)) for row in rows)
203
+
204
+
205
def format_table_footer(table_identifier, doc_id):
    """Return the closing delimiter line appended after a table chunk."""
    separator = '=' * 70
    return f"\n{separator}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
208
+
209
def load_json_documents(repo_id, hf_token, json_dir):
    """Download JSON section files (standalone and inside ZIP archives) from
    a Hugging Face dataset repo and extract their sections as Documents.

    Returns the flat list of extracted Documents.  Per-file outcomes are
    tallied into success/empty/failed stats and logged at the end.
    """
    import tempfile
    import os

    log_message("Loading JSON documents...")

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
    zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]

    log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")

    documents = []
    stats = {'success': 0, 'failed': 0, 'empty': 0}

    for file_path in json_files:
        try:
            log_message(f" Loading: {file_path}")
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            docs = extract_sections_from_json(local_path)
            if docs:
                documents.extend(docs)
                stats['success'] += 1
                log_message(f" ✓ Extracted {len(docs)} sections")
            else:
                stats['empty'] += 1
                log_message(f" ⚠ No sections found")

        except Exception as e:
            stats['failed'] += 1
            log_message(f" Error: {e}")

    for zip_path in zip_files:
        try:
            log_message(f" Processing ZIP: {zip_path}")
            local_zip = hf_hub_download(
                repo_id=repo_id,
                filename=zip_path,
                repo_type="dataset",
                token=hf_token
            )

            with zipfile.ZipFile(local_zip, 'r') as zf:
                # Skip macOS resource-fork entries and hidden files.
                json_files_in_zip = [f for f in zf.namelist()
                                     if f.endswith('.json')
                                     and not f.startswith('__MACOSX')
                                     and not f.startswith('.')
                                     and '._' not in f]

                log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")

                for json_file in json_files_in_zip:
                    try:
                        file_content = zf.read(json_file)

                        # Skip if file is too small to be real JSON
                        if len(file_content) < 10:
                            log_message(f" ✗ Skipping: {json_file} (file too small)")
                            stats['failed'] += 1
                            continue

                        # Cleanup: the original used four nested try/except
                        # blocks; a flat loop over candidate encodings is
                        # equivalent and far easier to extend.
                        text_content = None
                        for encoding in ('utf-8', 'utf-8-sig', 'utf-16', 'windows-1251'):
                            try:
                                text_content = file_content.decode(encoding)
                                break
                            except UnicodeDecodeError:
                                continue
                        if text_content is None:
                            log_message(f" ✗ Skipping: {json_file} (encoding failed)")
                            stats['failed'] += 1
                            continue

                        # Cheap structural sanity check before parsing.
                        stripped = text_content.strip()
                        if not stripped.startswith('{') and not stripped.startswith('['):
                            log_message(f" Skipping: {json_file} (not valid JSON)")
                            stats['failed'] += 1
                            continue

                        with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                         suffix='.json', encoding='utf-8') as tmp:
                            tmp.write(text_content)
                            tmp_path = tmp.name

                        # BUGFIX: unlink in `finally` so the temp file is not
                        # leaked when extraction raises.
                        try:
                            docs = extract_sections_from_json(tmp_path)
                        finally:
                            os.unlink(tmp_path)

                        if docs:
                            documents.extend(docs)
                            stats['success'] += 1
                            log_message(f" ✓ {json_file}: {len(docs)} sections")
                        else:
                            stats['empty'] += 1
                            log_message(f" ⚠ {json_file}: No sections")

                    except json.JSONDecodeError:
                        stats['failed'] += 1
                        log_message(f" ✗ {json_file}: Invalid JSON")
                    except Exception as e:
                        stats['failed'] += 1
                        log_message(f" ✗ {json_file}: {str(e)[:100]}")

        except Exception as e:
            log_message(f" ✗ Error with ZIP: {e}")

    log_message(f"="*60)
    log_message(f"JSON Loading Stats:")
    log_message(f" Success: {stats['success']}")
    log_message(f" Empty: {stats['empty']}")
    log_message(f" Failed: {stats['failed']}")
    log_message(f" Total sections: {len(documents)}")
    log_message(f"="*60)

    return documents
336
+
337
def extract_sections_from_json(json_path):
    """Extract section/subsection/sub-subsection texts from one JSON file.

    Every non-blank text body becomes a Document carrying type,
    document_id and section_id metadata.  Any error is logged and the
    (possibly partial) list collected so far is returned.
    """
    documents = []

    def _add(text, doc_id, section_id):
        # One Document per non-blank section body.
        documents.append(Document(
            text=text,
            metadata={
                'type': 'text',
                'document_id': doc_id,
                'section_id': section_id
            }
        ))

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')

        # Walk the three fixed nesting levels; the shared _add helper
        # replaces the three copies of the append block in the original.
        for section in data.get('sections', []):
            if section.get('section_text', '').strip():
                _add(section['section_text'], doc_id, section.get('section_id', ''))

            for subsection in section.get('subsections', []):
                if subsection.get('subsection_text', '').strip():
                    _add(subsection['subsection_text'], doc_id, subsection.get('subsection_id', ''))

                for sub_sub in subsection.get('sub_subsections', []):
                    if sub_sub.get('sub_subsection_text', '').strip():
                        _add(sub_sub['sub_subsection_text'], doc_id, sub_sub.get('sub_subsection_id', ''))

    except Exception as e:
        log_message(f"Error extracting from {json_path}: {e}")

    return documents
387
+
388
+
389
def load_table_documents(repo_id, hf_token, table_dir):
    """Download table JSON files from the HF dataset repo and chunk them.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token for the repository.
        table_dir: Repo-relative prefix under which table ``.json``
            files live.

    Returns:
        list: Table chunks produced by ``chunk_table_by_content`` for
        every sheet of every table file; files that fail to download or
        parse are logged and skipped.
    """
    log_message("Loading tables...")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    json_tables = [
        name for name in repo_files
        if name.startswith(table_dir) and name.endswith('.json')
    ]

    collected = []
    for file_path in json_tables:
        try:
            cached = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            with open(cached, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)

            # Document id may live under either key at file level.
            fallback_doc_id = payload.get('document_id',
                                          payload.get('document', 'unknown'))

            for sheet in payload.get('sheets', []):
                # Sheet-level id wins; fall back to the file-level one.
                sheet_id = sheet.get('document_id',
                                     sheet.get('document', fallback_doc_id))
                # Use the consistent MAX_CHARS_TABLE from config
                collected.extend(
                    chunk_table_by_content(
                        sheet, sheet_id,
                        max_chars=MAX_CHARS_TABLE,
                        max_rows=MAX_ROWS_TABLE
                    )
                )

        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    log_message(f"✓ Loaded {len(collected)} table chunks")
    return collected
422
+
423
+
424
def load_image_documents(repo_id, hf_token, image_dir):
    """Load image-description CSVs from the HF dataset repo as Documents.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token for the repository.
        image_dir: Repo-relative prefix under which the ``.csv`` files live.

    Returns:
        list[Document]: One document per CSV row, with the formatted
        description text and image metadata; failing files are logged
        and skipped.
    """
    log_message("Loading images...")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    description_csvs = [
        name for name in repo_files
        if name.startswith(image_dir) and name.endswith('.csv')
    ]

    documents = []
    for file_path in description_csvs:
        try:
            cached = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            frame = pd.read_csv(cached)

            for _, row in frame.iterrows():
                # NOTE: column names (including 'Описание изображение')
                # must match the CSV headers exactly.
                lines = [
                    f"Документ: {row.get('Обозначение документа', 'unknown')}",
                    f"Рисунок: {row.get('№ Изображения', 'unknown')}",
                    f"Название: {row.get('Название изображения', '')}",
                    f"Описание: {row.get('Описание изображение', '')}",
                    f"Раздел: {row.get('Раздел документа', '')}",
                ]
                content = "\n".join(lines) + "\n"

                documents.append(Document(
                    text=content,
                    metadata={
                        'type': 'image',
                        'document_id': str(row.get('Обозначение документа', 'unknown')),
                        'image_number': str(row.get('№ Изображения', 'unknown')),
                        'section': str(row.get('Раздел документа', '')),
                        'chunk_size': len(content)
                    }
                ))
        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    if documents:
        avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
        log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")

    return documents
470
+
471
+
472
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
    """Main loader - combines all document types.

    Loads and chunks text sections, pre-chunked tables and image
    descriptions, then concatenates them into one list.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token for the repository.
        json_dir: Repo prefix for section JSON archives.
        table_dir: Repo prefix for table JSON files.
        image_dir: Repo prefix for image-description CSVs.

    Returns:
        list: All chunks/documents in order: text, tables, images.
    """
    divider = "=" * 60
    log_message(divider)
    log_message("STARTING DOCUMENT LOADING")
    log_message(divider)

    # Text sections need chunking; tables arrive pre-chunked and
    # images are small enough to stay whole.
    text_chunks = chunk_text_documents(load_json_documents(repo_id, hf_token, json_dir))
    table_chunks = load_table_documents(repo_id, hf_token, table_dir)
    image_docs = load_image_documents(repo_id, hf_token, image_dir)

    all_docs = text_chunks + table_chunks + image_docs

    log_message(divider)
    log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
    log_message(f" Text chunks: {len(text_chunks)}")
    log_message(f" Table chunks: {len(table_chunks)}")
    log_message(f" Images: {len(image_docs)}")
    log_message(divider)

    return all_docs