MrSimple01 committed on
Commit
a2e9ee2
·
verified ·
1 Parent(s): fbc8fb0

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +663 -512
documents_prep.py CHANGED
@@ -1,513 +1,664 @@
1
- import json
2
- import zipfile
3
- import pandas as pd
4
- from huggingface_hub import hf_hub_download, list_repo_files
5
- from llama_index.core import Document
6
- from llama_index.core.text_splitter import SentenceSplitter
7
- from my_logging import log_message
8
- from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
-
10
- def chunk_text_documents(documents):
11
- text_splitter = SentenceSplitter(
12
- chunk_size=CHUNK_SIZE,
13
- chunk_overlap=CHUNK_OVERLAP
14
- )
15
-
16
- chunked = []
17
- for doc in documents:
18
- chunks = text_splitter.get_nodes_from_documents([doc])
19
- for i, chunk in enumerate(chunks):
20
- chunk.metadata.update({
21
- 'chunk_id': i,
22
- 'total_chunks': len(chunks),
23
- 'chunk_size': len(chunk.text) # Add chunk size
24
- })
25
- chunked.append(chunk)
26
-
27
- # Log statistics
28
- if chunked:
29
- avg_size = sum(len(c.text) for c in chunked) / len(chunked)
30
- min_size = min(len(c.text) for c in chunked)
31
- max_size = max(len(c.text) for c in chunked)
32
- log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
33
- log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
34
-
35
- return chunked
36
-
37
- def normalize_text(text):
38
- if not text:
39
- return text
40
-
41
- # Replace Cyrillic 'C' with Latin 'С' (U+0421)
42
- # This is for welding types like C-25 -> С-25
43
- text = text.replace('С-', 'C')
44
-
45
- # Also handle cases like "Type C" or variations
46
- import re
47
- # Match "C" followed by digit or space in context of welding types
48
- text = re.sub(r'\bС(\d)', r'С\1', text)
49
-
50
- return text
51
-
52
- def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
53
- headers = table_data.get('headers', [])
54
- rows = table_data.get('data', [])
55
- table_num = table_data.get('table_number', 'unknown')
56
- table_title = table_data.get('table_title', '')
57
- section = table_data.get('section', '')
58
-
59
- table_num_clean = str(table_num).strip()
60
- table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
61
-
62
- import re
63
- if 'приложени' in section.lower():
64
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
65
- if appendix_match:
66
- appendix_num = appendix_match.group(1).upper()
67
- table_identifier = f"{table_num_clean} Приложение {appendix_num}"
68
- else:
69
- table_identifier = table_num_clean
70
- else:
71
- table_identifier = table_num_clean
72
-
73
- if not rows:
74
- return []
75
-
76
- log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
77
-
78
- # Calculate base metadata size with NORMALIZED title
79
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
80
- base_size = len(base_content)
81
- available_space = max_chars - base_size - 200
82
-
83
- # If entire table fits, return as one chunk
84
- full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
85
- if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
86
- content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
87
-
88
- metadata = {
89
- 'type': 'table',
90
- 'document_id': doc_id,
91
- 'table_number': table_num_clean,
92
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE identifier
93
- 'table_title': table_title_normalized, # NORMALIZED
94
- 'section': section,
95
- 'total_rows': len(rows),
96
- 'chunk_size': len(content),
97
- 'is_complete_table': True
98
- }
99
-
100
- log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
101
- return [Document(text=content, metadata=metadata)]
102
-
103
- chunks = []
104
- current_rows = []
105
- current_size = 0
106
- chunk_num = 0
107
-
108
- for i, row in enumerate(rows):
109
- row_text = format_single_row(row, i + 1)
110
- row_size = len(row_text)
111
-
112
- should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
113
-
114
- if should_split:
115
- content = base_content + format_table_rows(current_rows)
116
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
117
- content += format_table_footer(table_identifier, doc_id)
118
-
119
- metadata = {
120
- 'type': 'table',
121
- 'document_id': doc_id,
122
- 'table_number': table_num_clean,
123
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
124
- 'table_title': table_title_normalized, # NORMALIZED
125
- 'section': section,
126
- 'chunk_id': chunk_num,
127
- 'row_start': current_rows[0]['_idx'] - 1,
128
- 'row_end': current_rows[-1]['_idx'],
129
- 'total_rows': len(rows),
130
- 'chunk_size': len(content),
131
- 'is_complete_table': False
132
- }
133
-
134
- chunks.append(Document(text=content, metadata=metadata))
135
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
136
-
137
- chunk_num += 1
138
- current_rows = []
139
- current_size = 0
140
-
141
- # Add row with index
142
- row_copy = row.copy() if isinstance(row, dict) else {'data': row}
143
- row_copy['_idx'] = i + 1
144
- current_rows.append(row_copy)
145
- current_size += row_size
146
-
147
- # Add final chunk
148
- if current_rows:
149
- content = base_content + format_table_rows(current_rows)
150
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
151
- content += format_table_footer(table_identifier, doc_id)
152
-
153
- metadata = {
154
- 'type': 'table',
155
- 'document_id': doc_id,
156
- 'table_number': table_num_clean,
157
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
158
- 'table_title': table_title_normalized, # NORMALIZED
159
- 'section': section,
160
- 'chunk_id': chunk_num,
161
- 'row_start': current_rows[0]['_idx'] - 1,
162
- 'row_end': current_rows[-1]['_idx'],
163
- 'total_rows': len(rows),
164
- 'chunk_size': len(content),
165
- 'is_complete_table': False
166
- }
167
-
168
- chunks.append(Document(text=content, metadata=metadata))
169
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
170
-
171
- return chunks
172
-
173
-
174
- # MODIFIED: Update format_table_header function
175
- def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
176
- content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
177
-
178
- # Add table type/number prominently for matching
179
- if table_num:
180
- content += f"ТИП: {normalize_text(table_num)}\n"
181
-
182
- if table_title:
183
- content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
184
-
185
- if section:
186
- content += f"РАЗДЕЛ: {section}\n"
187
-
188
- content += f"{'='*70}\n"
189
-
190
- if headers:
191
- header_str = ' | '.join(str(h) for h in headers)
192
- content += f"ЗАГОЛОВКИ: {header_str}\n\n"
193
-
194
- content += "ДАННЫЕ:\n"
195
- return content
196
-
197
-
198
- def format_single_row(row, idx):
199
- """Format a single row"""
200
- if isinstance(row, dict):
201
- parts = [f"{k}: {v}" for k, v in row.items()
202
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
203
- if parts:
204
- return f"{idx}. {' | '.join(parts)}\n"
205
- elif isinstance(row, list):
206
- parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
207
- if parts:
208
- return f"{idx}. {' | '.join(parts)}\n"
209
- return ""
210
-
211
-
212
- def format_table_rows(rows):
213
- """Format multiple rows"""
214
- content = ""
215
- for row in rows:
216
- idx = row.get('_idx', 0)
217
- content += format_single_row(row, idx)
218
- return content
219
-
220
-
221
- def format_table_footer(table_identifier, doc_id):
222
- """Format table footer"""
223
- return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
224
-
225
- def load_json_documents(repo_id, hf_token, json_dir):
226
- import zipfile
227
- import tempfile
228
- import os
229
-
230
- log_message("Loading JSON documents...")
231
-
232
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
233
- json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
234
- zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
235
-
236
- log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
237
-
238
- documents = []
239
- stats = {'success': 0, 'failed': 0, 'empty': 0}
240
-
241
- for file_path in json_files:
242
- try:
243
- log_message(f" Loading: {file_path}")
244
- local_path = hf_hub_download(
245
- repo_id=repo_id,
246
- filename=file_path,
247
- repo_type="dataset",
248
- token=hf_token
249
- )
250
-
251
- docs = extract_sections_from_json(local_path)
252
- if docs:
253
- documents.extend(docs)
254
- stats['success'] += 1
255
- log_message(f" ✓ Extracted {len(docs)} sections")
256
- else:
257
- stats['empty'] += 1
258
- log_message(f" ⚠ No sections found")
259
-
260
- except Exception as e:
261
- stats['failed'] += 1
262
- log_message(f" ✗ Error: {e}")
263
-
264
- for zip_path in zip_files:
265
- try:
266
- log_message(f" Processing ZIP: {zip_path}")
267
- local_zip = hf_hub_download(
268
- repo_id=repo_id,
269
- filename=zip_path,
270
- repo_type="dataset",
271
- token=hf_token
272
- )
273
-
274
- with zipfile.ZipFile(local_zip, 'r') as zf:
275
- json_files_in_zip = [f for f in zf.namelist()
276
- if f.endswith('.json')
277
- and not f.startswith('__MACOSX')
278
- and not f.startswith('.')
279
- and not '._' in f]
280
-
281
- log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")
282
-
283
- for json_file in json_files_in_zip:
284
- try:
285
- file_content = zf.read(json_file)
286
-
287
- # Skip if file is too small
288
- if len(file_content) < 10:
289
- log_message(f" ✗ Skipping: {json_file} (file too small)")
290
- stats['failed'] += 1
291
- continue
292
-
293
- # Try UTF-8 first (most common)
294
- try:
295
- text_content = file_content.decode('utf-8')
296
- except UnicodeDecodeError:
297
- try:
298
- text_content = file_content.decode('utf-8-sig')
299
- except UnicodeDecodeError:
300
- try:
301
- # Try UTF-16 (the issue you're seeing)
302
- text_content = file_content.decode('utf-16')
303
- except UnicodeDecodeError:
304
- try:
305
- text_content = file_content.decode('windows-1251')
306
- except UnicodeDecodeError:
307
- log_message(f" ✗ Skipping: {json_file} (encoding failed)")
308
- stats['failed'] += 1
309
- continue
310
-
311
- # Validate JSON structure
312
- if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
313
- log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
314
- stats['failed'] += 1
315
- continue
316
-
317
- with tempfile.NamedTemporaryFile(mode='w', delete=False,
318
- suffix='.json', encoding='utf-8') as tmp:
319
- tmp.write(text_content)
320
- tmp_path = tmp.name
321
-
322
- docs = extract_sections_from_json(tmp_path)
323
- if docs:
324
- documents.extend(docs)
325
- stats['success'] += 1
326
- log_message(f" ✓ {json_file}: {len(docs)} sections")
327
- else:
328
- stats['empty'] += 1
329
- log_message(f" ⚠ {json_file}: No sections")
330
-
331
- os.unlink(tmp_path)
332
-
333
- except json.JSONDecodeError as e:
334
- stats['failed'] += 1
335
- log_message(f" ✗ {json_file}: Invalid JSON")
336
- except Exception as e:
337
- stats['failed'] += 1
338
- log_message(f" ✗ {json_file}: {str(e)[:100]}")
339
-
340
- except Exception as e:
341
- log_message(f" ✗ Error with ZIP: {e}")
342
-
343
- log_message(f"="*60)
344
- log_message(f"JSON Loading Stats:")
345
- log_message(f" Success: {stats['success']}")
346
- log_message(f" Empty: {stats['empty']}")
347
- log_message(f" Failed: {stats['failed']}")
348
- log_message(f" Total sections: {len(documents)}")
349
- log_message(f"="*60)
350
-
351
- return documents
352
-
353
- def extract_sections_from_json(json_path):
354
- """Extract sections from a single JSON file"""
355
- documents = []
356
-
357
- try:
358
- with open(json_path, 'r', encoding='utf-8') as f:
359
- data = json.load(f)
360
-
361
- doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
362
-
363
- # Extract all section levels
364
- for section in data.get('sections', []):
365
- if section.get('section_text', '').strip():
366
- documents.append(Document(
367
- text=section['section_text'],
368
- metadata={
369
- 'type': 'text',
370
- 'document_id': doc_id,
371
- 'section_id': section.get('section_id', '')
372
- }
373
- ))
374
-
375
- # Subsections
376
- for subsection in section.get('subsections', []):
377
- if subsection.get('subsection_text', '').strip():
378
- documents.append(Document(
379
- text=subsection['subsection_text'],
380
- metadata={
381
- 'type': 'text',
382
- 'document_id': doc_id,
383
- 'section_id': subsection.get('subsection_id', '')
384
- }
385
- ))
386
-
387
- # Sub-subsections
388
- for sub_sub in subsection.get('sub_subsections', []):
389
- if sub_sub.get('sub_subsection_text', '').strip():
390
- documents.append(Document(
391
- text=sub_sub['sub_subsection_text'],
392
- metadata={
393
- 'type': 'text',
394
- 'document_id': doc_id,
395
- 'section_id': sub_sub.get('sub_subsection_id', '')
396
- }
397
- ))
398
-
399
- except Exception as e:
400
- log_message(f"Error extracting from {json_path}: {e}")
401
-
402
- return documents
403
-
404
-
405
- def load_table_documents(repo_id, hf_token, table_dir):
406
- log_message("Loading tables...")
407
-
408
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
409
- table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
410
-
411
- all_chunks = []
412
- for file_path in table_files:
413
- try:
414
- local_path = hf_hub_download(
415
- repo_id=repo_id,
416
- filename=file_path,
417
- repo_type="dataset",
418
- token=hf_token
419
- )
420
-
421
- with open(local_path, 'r', encoding='utf-8') as f:
422
- data = json.load(f)
423
-
424
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
425
-
426
- for sheet in data.get('sheets', []):
427
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
428
-
429
- # Use the consistent MAX_CHARS_TABLE from config
430
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
431
- all_chunks.extend(chunks)
432
-
433
- except Exception as e:
434
- log_message(f"Error loading {file_path}: {e}")
435
-
436
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
437
- return all_chunks
438
-
439
-
440
- def load_image_documents(repo_id, hf_token, image_dir):
441
- """Load image descriptions"""
442
- log_message("Loading images...")
443
-
444
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
445
- csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
446
-
447
- documents = []
448
- for file_path in csv_files:
449
- try:
450
- local_path = hf_hub_download(
451
- repo_id=repo_id,
452
- filename=file_path,
453
- repo_type="dataset",
454
- token=hf_token
455
- )
456
-
457
- df = pd.read_csv(local_path)
458
-
459
- for _, row in df.iterrows():
460
- content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
461
- content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
462
- content += f"Название: {row.get('Название изображения', '')}\n"
463
- content += f"Описание: {row.get('Описание изображение', '')}\n"
464
- content += f"Раздел: {row.get('Раздел документа', '')}\n"
465
-
466
- chunk_size = len(content)
467
-
468
- documents.append(Document(
469
- text=content,
470
- metadata={
471
- 'type': 'image',
472
- 'document_id': str(row.get('Обозначение документа', 'unknown')),
473
- 'image_number': str(row.get('№ Изображения', 'unknown')),
474
- 'section': str(row.get('Раздел документа', '')),
475
- 'chunk_size': chunk_size
476
- }
477
- ))
478
- except Exception as e:
479
- log_message(f"Error loading {file_path}: {e}")
480
-
481
- if documents:
482
- avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
483
- log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
484
-
485
- return documents
486
-
487
-
488
- def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
489
- """Main loader - combines all document types"""
490
- log_message("="*60)
491
- log_message("STARTING DOCUMENT LOADING")
492
- log_message("="*60)
493
-
494
- # Load text sections
495
- text_docs = load_json_documents(repo_id, hf_token, json_dir)
496
- text_chunks = chunk_text_documents(text_docs)
497
-
498
- # Load tables (already chunked)
499
- table_chunks = load_table_documents(repo_id, hf_token, table_dir)
500
-
501
- # Load images (no chunking needed)
502
- image_docs = load_image_documents(repo_id, hf_token, image_dir)
503
-
504
- all_docs = text_chunks + table_chunks + image_docs
505
-
506
- log_message("="*60)
507
- log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
508
- log_message(f" Text chunks: {len(text_chunks)}")
509
- log_message(f" Table chunks: {len(table_chunks)}")
510
- log_message(f" Images: {len(image_docs)}")
511
- log_message("="*60)
512
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  return all_docs
 
1
+ import json
2
+ import zipfile
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download, list_repo_files
5
+ from llama_index.core import Document
6
+ from llama_index.core.text_splitter import SentenceSplitter
7
+ from my_logging import log_message
8
+ from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
+
10
def normalize_text(text):
    """Normalize Latin letters in welding-type designations to Cyrillic.

    Welding types such as "C-25" appear with a Latin "C" in some sources but
    with Cyrillic "С" (U+0421) elsewhere; retrieval needs one canonical form,
    and the rest of this module (see steel-grade normalization) canonicalizes
    toward Cyrillic.

    Args:
        text: String to normalize. Falsy values are returned unchanged.

    Returns:
        The string with Latin "C" converted to Cyrillic "С" where it prefixes
        a hyphenated or numeric type code (e.g. "C-25" -> "С-25").
    """
    if not text:
        return text

    # Latin "C-" -> Cyrillic "С-".  The previous code did the reverse
    # (Cyrillic -> Latin) AND dropped the hyphen ("С-25" became "C25"),
    # contradicting the documented intent "C-25 -> С-25".
    text = text.replace('C-', '\u0421-')

    import re
    # Latin "C" immediately followed by a digit (e.g. "C25") -> Cyrillic.
    # The previous pattern matched Cyrillic "С" and replaced it with itself,
    # making it a no-op.
    text = re.sub(r'\bC(\d)', '\u0421\\1', text)

    return text
24
+
25
+ import re
26
+
27
def normalize_steel_designations(text):
    """Normalize steel-grade designations by converting Latin letters to Cyrillic.

    Handles patterns like 08X18H10T, 12X18H9, 10H17N13M2T, etc., where OCR or
    transliteration produced Latin look-alike letters inside a Russian steel
    grade (e.g. Latin "X" for Cyrillic "Х").

    Args:
        text: Input string. Falsy values are returned as ``(text, 0, [])``.

    Returns:
        Tuple ``(normalized_text, changes_count, changes_list)`` where
        ``changes_list`` holds "original converted" pairs for logging.
    """
    if not text:
        return text, 0, []

    changes_count = 0
    changes_list = []

    # Latin -> Cyrillic look-alike mapping for characters occurring in Russian
    # steel grades.  'N' is included because grades such as 10H17N13M2T
    # (10Х17Н13М2Т) transliterate Cyrillic "Н" as either "H" or "N"; the
    # original map omitted 'N' and produced mixed-script output despite the
    # docstring's own example.
    replacements = {
        'X': 'Х',
        'H': 'Н',
        'N': 'Н',
        'T': 'Т',
        'C': 'С',
        'B': 'В',
        'K': 'К',
        'M': 'М',
        'A': 'А',
        'P': 'Р',
    }

    # Grade shape: 1-3 leading digits, then one or more letter(+digits) groups,
    # e.g. 08X18H10T.  \b anchors prevent matching inside longer words.
    pattern = r'\b\d{1,3}(?:[A-ZА-Я]\d*)+\b'

    def replace_in_steel_grade(match):
        # Convert each character of the matched grade; record a change entry
        # only when the conversion actually altered the text.
        nonlocal changes_count, changes_list
        original = match.group(0)
        converted = ''.join(replacements.get(ch, ch) for ch in original)
        if converted != original:
            changes_count += 1
            changes_list.append(f"{original} {converted}")
        return converted

    normalized_text = re.sub(pattern, replace_in_steel_grade, text)

    return normalized_text, changes_count, changes_list
69
+
70
+
71
+
72
def chunk_text_documents(documents):
    """Split text documents into sentence-based chunks and normalize steel grades.

    Each chunk's text is passed through normalize_steel_designations() so that
    Latin look-alike letters in steel grades become Cyrillic before indexing.
    Chunk metadata gains 'chunk_id', 'total_chunks' and 'chunk_size'.

    Args:
        documents: Iterable of llama_index Document objects.

    Returns:
        List of chunk nodes with normalized text and updated metadata.
    """
    text_splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    log_message("="*60)
    log_message("NORMALIZING STEEL DESIGNATIONS IN TEXT CHUNKS")

    total_normalizations = 0
    chunks_with_changes = 0

    chunked = []
    for doc in documents:
        chunks = text_splitter.get_nodes_from_documents([doc])
        for i, chunk in enumerate(chunks):
            # Normalize steel designations in the chunk text
            # NOTE(review): original_text is captured but never used; presumably
            # kept for debugging — confirm before removing.
            original_text = chunk.text
            chunk.text, changes, change_list = normalize_steel_designations(chunk.text)  # FIX: 3 values

            if changes > 0:
                chunks_with_changes += 1
                total_normalizations += changes

            # chunk_size is recomputed AFTER normalization so it reflects the
            # stored text.
            chunk.metadata.update({
                'chunk_id': i,
                'total_chunks': len(chunks),
                'chunk_size': len(chunk.text)
            })
            chunked.append(chunk)

    # Log statistics
    if chunked:
        avg_size = sum(len(c.text) for c in chunked) / len(chunked)
        min_size = min(len(c.text) for c in chunked)
        max_size = max(len(c.text) for c in chunked)
        log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
        log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
        log_message(f" Steel designation normalization:")
        log_message(f" - Chunks with changes: {chunks_with_changes}/{len(chunked)}")
        log_message(f" - Total steel grades normalized: {total_normalizations}")
        # Conditional expression guards against division by zero when no chunk changed.
        log_message(f" - Avg per affected chunk: {total_normalizations/chunks_with_changes:.1f}" if chunks_with_changes > 0 else " - No normalizations needed")

    log_message("="*60)

    return chunked
118
+
119
+
120
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
    """Convert one table (a 'sheet' dict) into retrieval-ready Document chunks.

    Steel-grade designations in the title, section and every cell are
    normalized to Cyrillic first.  A table identifier is derived from the
    table number, or (when the number is missing) from the appendix name,
    title or section.  If the whole table fits in max_chars/max_rows it
    becomes one chunk; otherwise rows are split across chunks, each chunk
    repeating the table header and footer for standalone context.

    Args:
        table_data: Dict with optional 'headers', 'data', 'table_number',
            'table_title', 'section' and 'sheet_name' keys.
        doc_id: Identifier of the source document.
        max_chars: Per-chunk character budget.
        max_rows: Per-chunk row budget.

    Returns:
        List of llama_index Document objects; [] when the table has no rows.
    """
    headers = table_data.get('headers', [])
    rows = table_data.get('data', [])
    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    section = table_data.get('section', '')
    sheet_name = table_data.get('sheet_name', '')

    # Apply steel designation normalization to title and section
    # NOTE(review): section is passed without str() — assumes the JSON always
    # stores it as a string; confirm against the extraction pipeline.
    table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
    section, section_changes, section_list = normalize_steel_designations(section)

    table_num_clean = str(table_num).strip()

    import re

    # Derive a human-meaningful identifier when the table number is absent:
    # prefer "Приложение N", then the first words of the title, then the
    # section prefix.
    if table_num_clean in ['-', '', 'unknown', 'nan']:
        if 'приложени' in sheet_name.lower() or 'приложени' in section.lower():
            appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)',
                                       (sheet_name + ' ' + section).lower())
            if appendix_match:
                appendix_num = appendix_match.group(1)
                table_identifier = f"Приложение {appendix_num}"
            else:
                table_identifier = "Приложение"
        else:
            if table_title:
                first_words = ' '.join(table_title.split()[:5])
                table_identifier = f"{first_words}"
            else:
                table_identifier = section.split(',')[0] if section else "БезНомера"
    else:
        if 'приложени' in section.lower():
            appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)', section.lower())
            if appendix_match:
                appendix_num = appendix_match.group(1)
                table_identifier = f"{table_num_clean} Приложение {appendix_num}"
            else:
                table_identifier = table_num_clean
        else:
            table_identifier = table_num_clean

    if not rows:
        return []

    log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")

    # Normalize all row content (including steel designations)
    normalized_rows = []
    total_row_changes = 0
    rows_with_changes = 0
    all_row_changes = []  # NEW

    for row in rows:
        if isinstance(row, dict):
            normalized_row = {}
            row_had_changes = False
            for k, v in row.items():
                normalized_val, changes, change_list = normalize_steel_designations(str(v))
                normalized_row[k] = normalized_val
                if changes > 0:
                    total_row_changes += changes
                    row_had_changes = True
                    all_row_changes.extend(change_list)  # NEW
            if row_had_changes:
                rows_with_changes += 1
            normalized_rows.append(normalized_row)
        else:
            # Non-dict rows (e.g. plain lists) pass through unnormalized.
            normalized_rows.append(row)

    # Log normalization stats with examples
    if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
        log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")

        # NEW: Show examples of what changed
        if title_list:
            log_message(f" Title changes: {', '.join(title_list[:3])}")
        if section_list:
            log_message(f" Section changes: {', '.join(section_list[:3])}")
        if all_row_changes:
            log_message(f" Row examples: {', '.join(all_row_changes[:5])}")
    # Continue with rest of existing logic using normalized_rows...
    # Calculate base metadata size
    base_content = format_table_header(doc_id, table_identifier, table_num,
                                       table_title, section, headers,
                                       sheet_name)
    base_size = len(base_content)
    # Reserve 200 chars of headroom for the "Строки X-Y из N" line and footer.
    available_space = max_chars - base_size - 200

    # If entire table fits, return as one chunk
    full_rows_content = format_table_rows([{**row, '_idx': i+1}
                                           for i, row in enumerate(normalized_rows)])

    if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
        content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)

        metadata = {
            'type': 'table',
            'document_id': doc_id,
            'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
            'table_identifier': table_identifier,
            'table_title': table_title,
            'section': section,
            'sheet_name': sheet_name,
            'total_rows': len(normalized_rows),
            'chunk_size': len(content),
            'is_complete_table': True,
            'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
        }

        log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
        return [Document(text=content, metadata=metadata)]

    # Chunking logic continues...
    chunks = []
    current_rows = []
    current_size = 0
    chunk_num = 0

    for i, row in enumerate(normalized_rows):
        row_text = format_single_row(row, i + 1)
        row_size = len(row_text)

        # Flush the current chunk BEFORE adding this row if either budget
        # would be exceeded (and we have at least one buffered row).
        should_split = (current_size + row_size > available_space or
                        len(current_rows) >= max_rows) and current_rows

        if should_split:
            content = base_content + format_table_rows(current_rows)
            content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
            content += format_table_footer(table_identifier, doc_id)

            metadata = {
                'type': 'table',
                'document_id': doc_id,
                'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
                'table_identifier': table_identifier,
                'table_title': table_title,
                'section': section,
                'sheet_name': sheet_name,
                'chunk_id': chunk_num,
                'row_start': current_rows[0]['_idx'] - 1,
                'row_end': current_rows[-1]['_idx'],
                'total_rows': len(normalized_rows),
                'chunk_size': len(content),
                'is_complete_table': False,
                'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
            }

            chunks.append(Document(text=content, metadata=metadata))
            log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")

            chunk_num += 1
            current_rows = []
            current_size = 0

        # '_idx' is the 1-based row number within the full table; it is also
        # rendered into the chunk text by format_single_row.
        row_copy = row.copy() if isinstance(row, dict) else {'data': row}
        row_copy['_idx'] = i + 1
        current_rows.append(row_copy)
        current_size += row_size

    # Final chunk
    if current_rows:
        content = base_content + format_table_rows(current_rows)
        content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
        content += format_table_footer(table_identifier, doc_id)

        metadata = {
            'type': 'table',
            'document_id': doc_id,
            'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
            'table_identifier': table_identifier,
            'table_title': table_title,
            'section': section,
            'sheet_name': sheet_name,
            'chunk_id': chunk_num,
            'row_start': current_rows[0]['_idx'] - 1,
            'row_end': current_rows[-1]['_idx'],
            'total_rows': len(normalized_rows),
            'chunk_size': len(content),
            'is_complete_table': False,
            'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
        }

        chunks.append(Document(text=content, metadata=metadata))
        log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")

    return chunks
308
+
309
+
310
+
311
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
    """Build the header text that precedes a table's data rows.

    Emits the table identifier plus several searchable fields (number, sheet,
    title, section, keyword line), a separator, and the normalized column
    headers, ending with the "ДАННЫЕ:" marker.
    """
    lines = [f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"]

    # Multiple searchable identifiers improve retrieval recall.
    if table_num and table_num not in ['-', 'unknown']:
        lines.append(f"НОМЕР ТАБЛИЦЫ: {normalize_text(table_num)}\n")

    if sheet_name:
        lines.append(f"ЛИСТ: {sheet_name}\n")

    if table_title:
        lines.append(f"НАЗВАНИЕ: {normalize_text(table_title)}\n")

    if section:
        lines.append(f"РАЗДЕЛ: {section}\n")

    # Fixed keyword line plus document id for better matching.
    lines.append(f"КЛЮЧЕВЫЕ СЛОВА: материалы стали марки стандарты {doc_id}\n")
    lines.append(f"{'='*70}\n")

    if headers:
        # Column headers are normalized the same way as the other fields.
        joined = ' | '.join(normalize_text(str(col)) for col in headers)
        lines.append(f"ЗАГОЛОВКИ: {joined}\n\n")

    lines.append("ДАННЫЕ:\n")
    return ''.join(lines)
340
+
341
def format_single_row(row, idx):
    """Render one table row as a numbered ' | '-joined line, or '' if empty.

    Dict rows render as "key: value" cells, list rows as plain values; cells
    that are falsy, whitespace-only, or read 'nan'/'none' are dropped.  Any
    other row type yields the empty string.
    """
    def _keep(value):
        return bool(value) and bool(str(value).strip()) \
            and str(value).lower() not in ('nan', 'none', '')

    if isinstance(row, dict):
        cells = [f"{key}: {val}" for key, val in row.items() if _keep(val)]
    elif isinstance(row, list):
        cells = [str(val) for val in row if _keep(val)]
    else:
        return ""

    return f"{idx}. {' | '.join(cells)}\n" if cells else ""
353
+
354
+
355
def format_table_rows(rows):
    """Concatenate the rendered line for every row, keyed by its '_idx'.

    Each row dict is expected to carry a '_idx' entry (1-based row number);
    rows missing it render with index 0.
    """
    rendered = [format_single_row(entry, entry.get('_idx', 0)) for entry in rows]
    return ''.join(rendered)
362
+
363
+
364
def format_table_footer(table_identifier, doc_id):
    """Return the closing delimiter that marks the end of a rendered table."""
    separator = '=' * 70
    return f"\n{separator}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
367
+
368
def load_json_documents(repo_id, hf_token, json_dir):
    """Download and parse JSON section files from a HuggingFace dataset repo.

    Handles both loose ``.json`` files and ``.zip`` archives located under
    ``json_dir``.  Zip members are decoded with a cascade of encodings
    (utf-8, utf-8-sig, utf-16, windows-1251), sanity-checked, written to a
    temp file and parsed via extract_sections_from_json.  All failures are
    logged and counted rather than raised.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token (needed for private repos).
        json_dir: Path prefix inside the repo to scan.

    Returns:
        list of Document objects, one per non-empty section found.
    """
    import zipfile
    import tempfile
    import os

    def _decode(raw):
        # Encodings actually seen in this corpus, most common first.
        # Returns None when none of them fit (caller logs and skips).
        for encoding in ('utf-8', 'utf-8-sig', 'utf-16', 'windows-1251'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        return None

    log_message("Loading JSON documents...")

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
    zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]

    log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")

    documents = []
    stats = {'success': 0, 'failed': 0, 'empty': 0}

    # Loose JSON files: download and parse directly.
    for file_path in json_files:
        try:
            log_message(f" Loading: {file_path}")
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            docs = extract_sections_from_json(local_path)
            if docs:
                documents.extend(docs)
                stats['success'] += 1
                log_message(f" ✓ Extracted {len(docs)} sections")
            else:
                stats['empty'] += 1
                log_message(f" ⚠ No sections found")

        except Exception as e:
            stats['failed'] += 1
            log_message(f" ✗ Error: {e}")

    # ZIP archives: decode each member in memory, stage to a temp file.
    for zip_path in zip_files:
        try:
            log_message(f" Processing ZIP: {zip_path}")
            local_zip = hf_hub_download(
                repo_id=repo_id,
                filename=zip_path,
                repo_type="dataset",
                token=hf_token
            )

            with zipfile.ZipFile(local_zip, 'r') as zf:
                # Skip macOS resource-fork junk (__MACOSX, ._*, dotfiles).
                json_files_in_zip = [f for f in zf.namelist()
                                     if f.endswith('.json')
                                     and not f.startswith('__MACOSX')
                                     and not f.startswith('.')
                                     and '._' not in f]

                log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")

                for json_file in json_files_in_zip:
                    try:
                        file_content = zf.read(json_file)

                        # Anything shorter than ~"{}" plus a BOM is junk.
                        if len(file_content) < 10:
                            log_message(f" ✗ Skipping: {json_file} (file too small)")
                            stats['failed'] += 1
                            continue

                        text_content = _decode(file_content)
                        if text_content is None:
                            log_message(f" ✗ Skipping: {json_file} (encoding failed)")
                            stats['failed'] += 1
                            continue

                        # Cheap structural check before staging a temp file.
                        if not text_content.strip().startswith(('{', '[')):
                            log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
                            stats['failed'] += 1
                            continue

                        with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                         suffix='.json', encoding='utf-8') as tmp:
                            tmp.write(text_content)
                            tmp_path = tmp.name

                        try:
                            docs = extract_sections_from_json(tmp_path)
                        finally:
                            # Previously leaked the temp file when extraction raised.
                            os.unlink(tmp_path)

                        if docs:
                            documents.extend(docs)
                            stats['success'] += 1
                            log_message(f" ✓ {json_file}: {len(docs)} sections")
                        else:
                            stats['empty'] += 1
                            log_message(f" ⚠ {json_file}: No sections")

                    except json.JSONDecodeError:
                        stats['failed'] += 1
                        log_message(f" ✗ {json_file}: Invalid JSON")
                    except Exception as e:
                        stats['failed'] += 1
                        log_message(f" ✗ {json_file}: {str(e)[:100]}")

        except Exception as e:
            log_message(f" ✗ Error with ZIP: {e}")

    log_message(f"="*60)
    log_message(f"JSON Loading Stats:")
    log_message(f" Success: {stats['success']}")
    log_message(f" Empty: {stats['empty']}")
    log_message(f" Failed: {stats['failed']}")
    log_message(f" Total sections: {len(documents)}")
    log_message(f"="*60)

    return documents
495
+
496
def extract_sections_from_json(json_path):
    """Extract text sections from a single JSON document file.

    Walks the three-level hierarchy (sections -> subsections ->
    sub_subsections) and wraps every non-blank ``*_text`` field in a
    Document carrying the document id and the level's own id.  The
    Document-building logic, previously triplicated verbatim for each
    level, lives in one local helper.

    Args:
        json_path: Path to a UTF-8 JSON file.

    Returns:
        list of Document; empty on parse/IO errors (logged, not raised).
    """
    documents = []

    def _add(text, doc_id, section_id):
        # One Document per non-blank text node; the metadata shape is
        # identical across all three hierarchy levels.
        if text.strip():
            documents.append(Document(
                text=text,
                metadata={
                    'type': 'text',
                    'document_id': doc_id,
                    'section_id': section_id
                }
            ))

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')

        for section in data.get('sections', []):
            _add(section.get('section_text', ''), doc_id,
                 section.get('section_id', ''))

            for subsection in section.get('subsections', []):
                _add(subsection.get('subsection_text', ''), doc_id,
                     subsection.get('subsection_id', ''))

                for sub_sub in subsection.get('sub_subsections', []):
                    _add(sub_sub.get('sub_subsection_text', ''), doc_id,
                         sub_sub.get('sub_subsection_id', ''))

    except Exception as e:
        log_message(f"Error extracting from {json_path}: {e}")

    return documents
546
+
547
+
548
def load_table_documents(repo_id, hf_token, table_dir):
    """Download table JSON files from the repo and chunk every sheet.

    Each sheet inherits its document id from the sheet itself when
    present, falling back to the enclosing file's id (which itself
    falls back to 'unknown').

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token.
        table_dir: Path prefix inside the repo holding table JSON files.

    Returns:
        list of table chunks produced by chunk_table_by_content.
    """
    log_message("Loading tables...")
    log_message("="*60)
    log_message("NORMALIZING STEEL DESIGNATIONS IN TABLES")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    table_files = [name for name in repo_files
                   if name.startswith(table_dir) and name.endswith('.json')]

    all_chunks = []
    tables_processed = 0

    for file_path in table_files:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            with open(local_path, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)

            # document_id -> document -> 'unknown' fallback chain.
            file_doc_id = payload.get('document_id', payload.get('document', 'unknown'))

            for sheet in payload.get('sheets', []):
                sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
                tables_processed += 1
                all_chunks.extend(
                    chunk_table_by_content(sheet, sheet_doc_id,
                                           max_chars=MAX_CHARS_TABLE,
                                           max_rows=MAX_ROWS_TABLE)
                )

        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    log_message(f"✓ Loaded {len(all_chunks)} table chunks from {tables_processed} tables")
    log_message("="*60)

    return all_chunks
589
+
590
+
591
def load_image_documents(repo_id, hf_token, image_dir):
    """Load image-description CSV files and wrap each row in a Document.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token.
        image_dir: Path prefix inside the repo holding the CSV files.

    Returns:
        list of Document, one per CSV row, with image metadata attached.
    """
    log_message("Loading images...")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    csv_files = [name for name in repo_files
                 if name.startswith(image_dir) and name.endswith('.csv')]

    documents = []
    for file_path in csv_files:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            df = pd.read_csv(local_path)

            for _, row in df.iterrows():
                # NOTE(review): column names (incl. 'Описание изображение')
                # must match the CSV headers exactly — do not "fix" them.
                lines = [
                    f"Документ: {row.get('Обозначение документа', 'unknown')}",
                    f"Рисунок: {row.get('№ Изображения', 'unknown')}",
                    f"Название: {row.get('Название изображения', '')}",
                    f"Описание: {row.get('Описание изображение', '')}",
                    f"Раздел: {row.get('Раздел документа', '')}",
                ]
                content = "\n".join(lines) + "\n"

                documents.append(Document(
                    text=content,
                    metadata={
                        'type': 'image',
                        'document_id': str(row.get('Обозначение документа', 'unknown')),
                        'image_number': str(row.get('№ Изображения', 'unknown')),
                        'section': str(row.get('Раздел документа', '')),
                        'chunk_size': len(content)
                    }
                ))
        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    if documents:
        avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
        log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")

    return documents
637
+
638
+
639
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
    """Top-level loader combining all three document sources.

    Loads text sections (then chunks them), pre-chunked table data, and
    image descriptions, and concatenates everything into one list.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token.
        json_dir: Repo prefix for section JSON files / archives.
        table_dir: Repo prefix for table JSON files.
        image_dir: Repo prefix for image-description CSVs.

    Returns:
        list of all documents/chunks, in text -> table -> image order.
    """
    banner = "=" * 60
    log_message(banner)
    log_message("STARTING DOCUMENT LOADING")
    log_message(banner)

    # Text sections need splitting; tables arrive pre-chunked and images
    # are one Document per row already.
    text_chunks = chunk_text_documents(load_json_documents(repo_id, hf_token, json_dir))
    table_chunks = load_table_documents(repo_id, hf_token, table_dir)
    image_docs = load_image_documents(repo_id, hf_token, image_dir)

    all_docs = text_chunks + table_chunks + image_docs

    log_message(banner)
    log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
    log_message(f" Text chunks: {len(text_chunks)}")
    log_message(f" Table chunks: {len(table_chunks)}")
    log_message(f" Images: {len(image_docs)}")
    log_message(banner)

    return all_docs