MrSimple07 commited on
Commit
d1e7fd2
·
1 Parent(s): 9da507d

new documents prep

Browse files
Files changed (3) hide show
  1. documents_prep.py +501 -439
  2. table_prep.py +9 -7
  3. utils.py +0 -93
documents_prep.py CHANGED
@@ -3,486 +3,548 @@ import zipfile
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
6
- from my_logging import log_message
7
  from llama_index.core.text_splitter import SentenceSplitter
8
- from config import CHUNK_SIZE, CHUNK_OVERLAP
9
- from table_prep import table_to_document, load_table_data
10
 
 
 
 
11
 
12
- def chunk_document(doc, chunk_size=None, chunk_overlap=None):
13
- if chunk_size is None:
14
- chunk_size = CHUNK_SIZE
15
- if chunk_overlap is None:
16
- chunk_overlap = CHUNK_OVERLAP
17
  text_splitter = SentenceSplitter(
18
- chunk_size=chunk_size,
19
- chunk_overlap=chunk_overlap,
20
- separator=" "
21
  )
22
 
23
- text_chunks = text_splitter.split_text(doc.text)
24
-
25
- chunked_docs = []
26
- for i, chunk_text in enumerate(text_chunks):
27
- chunk_metadata = doc.metadata.copy()
28
- chunk_metadata.update({
29
- "chunk_id": i,
30
- "total_chunks": len(text_chunks),
31
- "chunk_size": len(chunk_text),
32
- "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
33
- })
34
-
35
- chunked_doc = Document(
36
- text=chunk_text,
37
- metadata=chunk_metadata
38
- )
39
- chunked_docs.append(chunked_doc)
 
40
 
41
- return chunked_docs
42
 
43
- def process_documents_with_chunking(documents):
44
- all_chunked_docs = []
45
- chunk_info = []
46
- table_count = 0
47
- table_chunks_count = 0
48
- image_count = 0
49
- image_chunks_count = 0
50
- text_chunks_count = 0
 
 
51
 
52
- for doc in documents:
53
- doc_type = doc.metadata.get('type', 'text')
54
- is_already_chunked = doc.metadata.get('is_chunked', False)
55
-
56
- if doc_type == 'table':
57
- if is_already_chunked:
58
- table_chunks_count += 1
59
- all_chunked_docs.append(doc)
60
- chunk_info.append({
61
- 'document_id': doc.metadata.get('document_id', 'unknown'),
62
- 'section_id': doc.metadata.get('section_id', 'unknown'),
63
- 'chunk_id': doc.metadata.get('chunk_id', 0),
64
- 'total_chunks': doc.metadata.get('total_chunks', 1),
65
- 'chunk_size': len(doc.text),
66
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
67
- 'type': 'table',
68
- 'table_number': doc.metadata.get('table_number', 'unknown')
69
- })
70
- else:
71
- table_count += 1
72
- all_chunked_docs.append(doc)
73
- chunk_info.append({
74
- 'document_id': doc.metadata.get('document_id', 'unknown'),
75
- 'section_id': doc.metadata.get('section_id', 'unknown'),
76
- 'chunk_id': 0,
77
- 'chunk_size': len(doc.text),
78
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
79
- 'type': 'table',
80
- 'table_number': doc.metadata.get('table_number', 'unknown')
81
- })
82
-
83
- elif doc_type == 'image':
84
- image_count += 1
85
- doc_size = len(doc.text)
86
- if doc_size > CHUNK_SIZE:
87
- log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
88
- f"Размер: {doc_size} > {CHUNK_SIZE}")
89
- chunked_docs = chunk_document(doc)
90
- image_chunks_count += len(chunked_docs)
91
- all_chunked_docs.extend(chunked_docs)
92
- log_message(f" ✂️ Разделено на {len(chunked_docs)} чанков")
93
-
94
- for i, chunk_doc in enumerate(chunked_docs):
95
- chunk_info.append({
96
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
97
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
98
- 'chunk_id': i,
99
- 'chunk_size': len(chunk_doc.text),
100
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
101
- 'type': 'image',
102
- 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
103
- })
104
- else:
105
- all_chunked_docs.append(doc)
106
- chunk_info.append({
107
- 'document_id': doc.metadata.get('document_id', 'unknown'),
108
- 'section_id': doc.metadata.get('section_id', 'unknown'),
109
- 'chunk_id': 0,
110
- 'chunk_size': doc_size,
111
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
112
- 'type': 'image',
113
- 'image_number': doc.metadata.get('image_number', 'unknown')
114
- })
115
-
116
  else:
117
- doc_size = len(doc.text)
118
- if doc_size > CHUNK_SIZE:
119
- log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
120
- f"Размер: {doc_size} > {CHUNK_SIZE}")
121
- chunked_docs = chunk_document(doc)
122
- text_chunks_count += len(chunked_docs)
123
- all_chunked_docs.extend(chunked_docs)
124
- log_message(f" ✂️ Разделен на {len(chunked_docs)} чанков")
125
-
126
- for i, chunk_doc in enumerate(chunked_docs):
127
- chunk_info.append({
128
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
129
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
130
- 'chunk_id': i,
131
- 'chunk_size': len(chunk_doc.text),
132
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
133
- 'type': 'text'
134
- })
135
- else:
136
- all_chunked_docs.append(doc)
137
- chunk_info.append({
138
- 'document_id': doc.metadata.get('document_id', 'unknown'),
139
- 'section_id': doc.metadata.get('section_id', 'unknown'),
140
- 'chunk_id': 0,
141
- 'chunk_size': doc_size,
142
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
143
- 'type': 'text'
144
- })
145
-
146
- log_message(f"\n{'='*60}")
147
- log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
148
- log_message(f" • Таблицы (целые): {table_count}")
149
- log_message(f" • Таблицы (чанки): {table_chunks_count}")
150
- log_message(f" • Изображения (целые): {image_count - (image_chunks_count > 0)}")
151
- log_message(f" • Изображения (чанки): {image_chunks_count}")
152
- log_message(f" • Текстовые чанки: {text_chunks_count}")
153
- log_message(f" • Всего документов: {len(all_chunked_docs)}")
154
- log_message(f"{'='*60}\n")
155
-
156
- return all_chunked_docs, chunk_info
157
-
158
- def extract_text_from_json(data, document_id, document_name):
159
- documents = []
160
 
161
- if 'sections' in data:
162
- for section in data['sections']:
163
- section_id = section.get('section_id', 'Unknown')
164
- section_text = section.get('section_text', '')
165
-
166
- section_path = f"{section_id}"
167
- section_title = extract_section_title(section_text)
168
-
169
- if section_text.strip():
170
- doc = Document(
171
- text=section_text,
172
- metadata={
173
- "type": "text",
174
- "document_id": document_id,
175
- "document_name": document_name,
176
- "section_id": section_id,
177
- "section_text": section_title[:200],
178
- "section_path": section_path,
179
- "level": "section"
180
- }
181
- )
182
- documents.append(doc)
183
-
184
- if 'subsections' in section:
185
- for subsection in section['subsections']:
186
- subsection_id = subsection.get('subsection_id', 'Unknown')
187
- subsection_text = subsection.get('subsection_text', '')
188
- subsection_title = extract_section_title(subsection_text)
189
- subsection_path = f"{section_path}.{subsection_id}"
190
-
191
- if subsection_text.strip():
192
- doc = Document(
193
- text=subsection_text,
194
- metadata={
195
- "type": "text",
196
- "document_id": document_id,
197
- "document_name": document_name,
198
- "section_id": subsection_id,
199
- "section_text": subsection_title[:200],
200
- "section_path": subsection_path,
201
- "level": "subsection",
202
- "parent_section": section_id,
203
- "parent_title": section_title[:100]
204
- }
205
- )
206
- documents.append(doc)
207
-
208
- if 'sub_subsections' in subsection:
209
- for sub_subsection in subsection['sub_subsections']:
210
- sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
211
- sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
212
- sub_subsection_title = extract_section_title(sub_subsection_text)
213
- sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
214
-
215
- if sub_subsection_text.strip():
216
- doc = Document(
217
- text=sub_subsection_text,
218
- metadata={
219
- "type": "text",
220
- "document_id": document_id,
221
- "document_name": document_name,
222
- "section_id": sub_subsection_id,
223
- "section_text": sub_subsection_title[:200],
224
- "section_path": sub_subsection_path,
225
- "level": "sub_subsection",
226
- "parent_section": subsection_id,
227
- "parent_title": subsection_title[:100]
228
- }
229
- )
230
- documents.append(doc)
231
-
232
- if 'sub_sub_subsections' in sub_subsection:
233
- for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
234
- sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
235
- sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
236
- sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
237
-
238
- if sub_sub_subsection_text.strip():
239
- doc = Document(
240
- text=sub_sub_subsection_text,
241
- metadata={
242
- "type": "text",
243
- "document_id": document_id,
244
- "document_name": document_name,
245
- "section_id": sub_sub_subsection_id,
246
- "section_text": sub_sub_subsection_title[:200],
247
- "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
248
- "level": "sub_sub_subsection",
249
- "parent_section": sub_subsection_id,
250
- "parent_title": sub_subsection_title[:100]
251
- }
252
- )
253
- documents.append(doc)
254
 
255
- return documents
256
-
257
- def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
258
- log_message("Начинаю загрузку JSON документов")
259
 
260
- try:
261
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
262
- zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
263
- json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
264
-
265
- log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
 
 
 
266
 
267
- all_documents = []
 
 
 
 
 
 
 
 
 
 
268
 
269
- for zip_file_path in zip_files:
270
- try:
271
- log_message(f"Загружаю ZIP архив: {zip_file_path}")
272
- local_zip_path = hf_hub_download(
273
- repo_id=repo_id,
274
- filename=zip_file_path,
275
- local_dir=download_dir,
276
- repo_type="dataset",
277
- token=hf_token
278
- )
279
-
280
- documents = extract_zip_and_process_json(local_zip_path)
281
- all_documents.extend(documents)
282
- log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
283
-
284
- except Exception as e:
285
- log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
286
- continue
287
-
288
- for file_path in json_files:
289
- try:
290
- log_message(f"Обрабатываю прямой JSON файл: {file_path}")
291
- local_path = hf_hub_download(
292
- repo_id=repo_id,
293
- filename=file_path,
294
- local_dir=download_dir,
295
- repo_type="dataset",
296
- token=hf_token
297
- )
298
-
299
- with open(local_path, 'r', encoding='utf-8') as f:
300
- json_data = json.load(f)
301
-
302
- document_metadata = json_data.get('document_metadata', {})
303
- document_id = document_metadata.get('document_id', 'unknown')
304
- document_name = document_metadata.get('document_name', 'unknown')
305
-
306
- documents = extract_text_from_json(json_data, document_id, document_name)
307
- all_documents.extend(documents)
308
-
309
- log_message(f"Извлечено {len(documents)} документов из {file_path}")
310
-
311
- except Exception as e:
312
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
313
- continue
314
-
315
- log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
316
 
317
- # Process documents through chunking function
318
- chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
- log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
 
 
 
 
 
 
 
 
 
 
321
 
322
- return chunked_documents, chunk_info
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
324
- except Exception as e:
325
- log_message(f"Ошибка загрузки JSON документов: {str(e)}")
326
- return [], []
 
327
 
328
- def extract_section_title(section_text):
329
- if not section_text.strip():
330
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
- lines = section_text.strip().split('\n')
333
- first_line = lines[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
- if len(first_line) < 200 and not first_line.endswith('.'):
336
- return first_line
337
 
338
- # Otherwise, extract first sentence
339
- sentences = first_line.split('.')
340
- if len(sentences) > 1:
341
- return sentences[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
- return first_line[:100] + "..." if len(first_line) > 100 else first_line
 
 
344
 
345
- def extract_zip_and_process_json(zip_path):
 
 
 
 
 
 
 
 
 
 
 
 
346
  documents = []
 
347
 
348
- try:
349
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
350
- zip_files = zip_ref.namelist()
351
- json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
 
 
 
 
 
352
 
353
- log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
 
 
 
 
 
 
 
354
 
355
- for json_file in json_files:
356
- try:
357
- log_message(f"Обрабатываю файл из архива: {json_file}")
358
-
359
- with zip_ref.open(json_file) as f:
360
- json_data = json.load(f)
361
-
362
- document_metadata = json_data.get('document_metadata', {})
363
- document_id = document_metadata.get('document_id', 'unknown')
364
- document_name = document_metadata.get('document_name', 'unknown')
365
-
366
- docs = extract_text_from_json(json_data, document_id, document_name)
367
- documents.extend(docs)
368
-
369
- log_message(f"Извлечено {len(docs)} документов из {json_file}")
370
-
371
- except Exception as e:
372
- log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
373
- continue
374
 
375
- except Exception as e:
376
- log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
378
  return documents
379
 
380
- def load_image_data(repo_id, hf_token, image_data_dir):
381
- log_message("Начинаю загрузку данных изображений")
 
382
 
383
- image_files = []
384
  try:
385
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
386
- for file in files:
387
- if file.startswith(image_data_dir) and file.endswith('.csv'):
388
- image_files.append(file)
389
 
390
- log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
391
 
392
- image_documents = []
393
- for file_path in image_files:
394
- try:
395
- log_message(f"Обрабатываю файл изображений: {file_path}")
396
- local_path = hf_hub_download(
397
- repo_id=repo_id,
398
- filename=file_path,
399
- local_dir='',
400
- repo_type="dataset",
401
- token=hf_token
402
- )
403
-
404
- df = pd.read_csv(local_path)
405
- log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
406
-
407
- # Обработка с правильными названиями колонок
408
- for _, row in df.iterrows():
409
- section_value = row.get('Раздел документа', 'Неизвестно')
410
-
411
- content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
412
- content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
413
- content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n" # Опечатка в названии колонки
414
- content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
415
- content += f"Раздел: {section_value}\n"
416
- content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
417
-
418
- doc = Document(
419
- text=content,
420
  metadata={
421
- "type": "image",
422
- "image_number": str(row.get('№ Изображения', 'unknown')),
423
- "image_title": str(row.get('Название изображения', 'unknown')),
424
- "image_description": str(row.get('Описание изображение', 'unknown')),
425
- "document_id": str(row.get('Обозначение документа', 'unknown')),
426
- "file_path": str(row.get('Файл изображения', 'unknown')),
427
- "section": str(section_value),
428
- "section_id": str(section_value)
429
  }
430
- )
431
- image_documents.append(doc)
432
-
433
- except Exception as e:
434
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
435
- continue
436
-
437
- log_message(f"Создано {len(image_documents)} документов из изображений")
438
- return image_documents
439
-
 
 
 
 
440
  except Exception as e:
441
- log_message(f"Ошибка загрузки данных изображений: {str(e)}")
442
- return []
 
443
 
444
 
445
- def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
446
- log_message("Загружаю данные чанков из CSV")
 
447
 
448
- try:
449
- chunks_csv_path = hf_hub_download(
450
- repo_id=repo_id,
451
- filename=chunks_filename,
452
- local_dir=download_dir,
453
- repo_type="dataset",
454
- token=hf_token
455
- )
456
-
457
- chunks_df = pd.read_csv(chunks_csv_path)
458
- log_message(f"Загружено {len(chunks_df)} чанков из CSV")
459
-
460
- text_column = None
461
- for col in chunks_df.columns:
462
- if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
463
- text_column = col
464
- break
465
-
466
- if text_column is None:
467
- text_column = chunks_df.columns[0]
468
-
469
- log_message(f"Использую колонку: {text_column}")
470
-
471
- documents = []
472
- for i, (_, row) in enumerate(chunks_df.iterrows()):
473
- doc = Document(
474
- text=str(row[text_column]),
475
- metadata={
476
- "chunk_id": row.get('chunk_id', i),
477
- "document_id": row.get('document_id', 'unknown'),
478
- "type": "text"
479
- }
480
  )
481
- documents.append(doc)
482
-
483
- log_message(f"Создано {len(documents)} текстовых документов из CSV")
484
- return documents, chunks_df
485
-
486
- except Exception as e:
487
- log_message(f"Ошибка загрузки CSV данных: {str(e)}")
488
- return [], None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
 
6
  from llama_index.core.text_splitter import SentenceSplitter
7
+ from my_logging import log_message
 
8
 
9
+ # Configuration
10
+ CHUNK_SIZE = 512
11
+ CHUNK_OVERLAP = 128
12
 
13
+ def chunk_text_documents(documents):
 
 
 
 
14
  text_splitter = SentenceSplitter(
15
+ chunk_size=CHUNK_SIZE,
16
+ chunk_overlap=CHUNK_OVERLAP
 
17
  )
18
 
19
+ chunked = []
20
+ for doc in documents:
21
+ chunks = text_splitter.get_nodes_from_documents([doc])
22
+ for i, chunk in enumerate(chunks):
23
+ chunk.metadata.update({
24
+ 'chunk_id': i,
25
+ 'total_chunks': len(chunks),
26
+ 'chunk_size': len(chunk.text) # Add chunk size
27
+ })
28
+ chunked.append(chunk)
29
+
30
+ # Log statistics
31
+ if chunked:
32
+ avg_size = sum(len(c.text) for c in chunked) / len(chunked)
33
+ min_size = min(len(c.text) for c in chunked)
34
+ max_size = max(len(c.text) for c in chunked)
35
+ log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
36
+ log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
37
 
38
+ return chunked
39
 
40
+
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
42
+ """Chunk tables by content size instead of rows"""
43
+ headers = table_data.get('headers', [])
44
+ rows = table_data.get('data', [])
45
+ table_num = table_data.get('table_number', 'unknown')
46
+ table_title = table_data.get('table_title', '')
47
+ section = table_data.get('section', '')
48
+
49
+ table_num_clean = str(table_num).strip()
50
 
51
+ # Create section-aware identifier
52
+ import re
53
+ if 'приложени' in section.lower():
54
+ appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
55
+ if appendix_match:
56
+ appendix_num = appendix_match.group(1).upper()
57
+ table_identifier = f"{table_num_clean} Приложение {appendix_num}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  else:
59
+ table_identifier = table_num_clean
60
+ else:
61
+ table_identifier = table_num_clean
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ if not rows:
64
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
 
 
 
67
 
68
+ # Calculate base metadata size (everything except row data)
69
+ base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
70
+ base_size = len(base_content)
71
+ available_space = max_chars - base_size - 200
72
+
73
+ # If entire table fits, return as one chunk
74
+ full_rows_content = format_table_rows(rows)
75
+ if base_size + len(full_rows_content) <= max_chars:
76
+ content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
77
 
78
+ metadata = {
79
+ 'type': 'table',
80
+ 'document_id': doc_id,
81
+ 'table_number': table_num_clean,
82
+ 'table_identifier': table_identifier,
83
+ 'table_title': table_title,
84
+ 'section': section,
85
+ 'total_rows': len(rows),
86
+ 'chunk_size': len(content),
87
+ 'is_complete_table': True
88
+ }
89
 
90
+ log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
91
+ return [Document(text=content, metadata=metadata)]
92
+
93
+ # Otherwise, chunk by content size
94
+ chunks = []
95
+ current_rows = []
96
+ current_size = 0
97
+ chunk_num = 0
98
+
99
+ for i, row in enumerate(rows):
100
+ row_text = format_single_row(row, i + 1)
101
+ row_size = len(row_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ # If adding this row exceeds limit, save current chunk
104
+ if current_size + row_size > available_space and current_rows:
105
+ content = base_content + format_table_rows(current_rows)
106
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
107
+ content += format_table_footer(table_identifier, doc_id)
108
+
109
+ metadata = {
110
+ 'type': 'table',
111
+ 'document_id': doc_id,
112
+ 'table_number': table_num_clean,
113
+ 'table_identifier': table_identifier,
114
+ 'table_title': table_title,
115
+ 'section': section,
116
+ 'chunk_id': chunk_num,
117
+ 'row_start': current_rows[0]['_idx'] - 1,
118
+ 'row_end': current_rows[-1]['_idx'],
119
+ 'total_rows': len(rows),
120
+ 'chunk_size': len(content),
121
+ 'is_complete_table': False
122
+ }
123
+
124
+ chunks.append(Document(text=content, metadata=metadata))
125
+ log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
126
+
127
+ chunk_num += 1
128
+ current_rows = []
129
+ current_size = 0
130
 
131
+ # Add row index for tracking
132
+ row_copy = row.copy() if isinstance(row, dict) else {'data': row}
133
+ row_copy['_idx'] = i + 1
134
+ current_rows.append(row_copy)
135
+ current_size += row_size
136
+
137
+ # Add final chunk if rows remain
138
+ if current_rows:
139
+ content = base_content + format_table_rows(current_rows)
140
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
141
+ content += format_table_footer(table_identifier, doc_id)
142
 
143
+ metadata = {
144
+ 'type': 'table',
145
+ 'document_id': doc_id,
146
+ 'table_number': table_num_clean,
147
+ 'table_identifier': table_identifier,
148
+ 'table_title': table_title,
149
+ 'section': section,
150
+ 'chunk_id': chunk_num,
151
+ 'row_start': current_rows[0]['_idx'] - 1,
152
+ 'row_end': current_rows[-1]['_idx'],
153
+ 'total_rows': len(rows),
154
+ 'chunk_size': len(content),
155
+ 'is_complete_table': False
156
+ }
157
 
158
+ chunks.append(Document(text=content, metadata=metadata))
159
+ log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
160
+
161
+ return chunks
162
 
163
+
164
+ def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
165
+ """Format consistent table header"""
166
+ content = f"ДОКУМЕНТ: {doc_id}\n"
167
+ content += f"ТАБЛИЦА: {table_identifier}\n"
168
+ content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
169
+ content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
170
+ if table_title:
171
+ content += f"НАЗВАНИЕ: {table_title}\n"
172
+ if section:
173
+ content += f"РАЗДЕЛ: {section}\n"
174
+ content += f"{'='*70}\n\n"
175
+
176
+ # Enhanced search keywords
177
+ content += f"Это таблица {table_identifier} из документа {doc_id}. "
178
+ content += f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. "
179
+
180
+ if section:
181
+ content += f"Раздел: {section}. "
182
+ if 'приложени' in section.lower():
183
+ content += f"Таблица из приложения. "
184
+
185
+ if table_title:
186
+ content += f"Название: {table_title}. "
187
+
188
+ content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n"
189
+
190
+ if headers:
191
+ header_str = ' | '.join(str(h) for h in headers)
192
+ content += f"ЗАГОЛОВКИ: {header_str}\n\n"
193
 
194
+ content += "ДАННЫЕ:\n"
195
+ return content
196
+
197
+
198
+ def format_single_row(row, idx):
199
+ """Format a single row"""
200
+ if isinstance(row, dict):
201
+ parts = [f"{k}: {v}" for k, v in row.items()
202
+ if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
203
+ if parts:
204
+ return f"{idx}. {' | '.join(parts)}\n"
205
+ elif isinstance(row, list):
206
+ parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
207
+ if parts:
208
+ return f"{idx}. {' | '.join(parts)}\n"
209
+ return ""
210
+
211
+
212
+ def format_table_rows(rows):
213
+ """Format multiple rows"""
214
+ content = ""
215
+ for row in rows:
216
+ idx = row.get('_idx', 0)
217
+ content += format_single_row(row, idx)
218
+ return content
219
+
220
+
221
+ def format_table_footer(table_identifier, doc_id):
222
+ """Format table footer"""
223
+ return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
224
+
225
+ def load_table_documents(repo_id, hf_token, table_dir):
226
+ log_message("Loading tables...")
227
 
228
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
229
+ table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
230
 
231
+ all_chunks = []
232
+ for file_path in table_files:
233
+ try:
234
+ local_path = hf_hub_download(
235
+ repo_id=repo_id,
236
+ filename=file_path,
237
+ repo_type="dataset",
238
+ token=hf_token
239
+ )
240
+
241
+ with open(local_path, 'r', encoding='utf-8') as f:
242
+ data = json.load(f)
243
+
244
+ file_doc_id = data.get('document_id', data.get('document', 'unknown'))
245
+
246
+ for sheet in data.get('sheets', []):
247
+ sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
248
+
249
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
250
+ all_chunks.extend(chunks)
251
+
252
+ except Exception as e:
253
+ log_message(f"Error loading {file_path}: {e}")
254
 
255
+ log_message(f" Loaded {len(all_chunks)} table chunks")
256
+ return all_chunks
257
+
258
 
259
+ def load_json_documents(repo_id, hf_token, json_dir):
260
+ import zipfile
261
+ import tempfile
262
+ import os
263
+
264
+ log_message("Loading JSON documents...")
265
+
266
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
267
+ json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
268
+ zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
269
+
270
+ log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
271
+
272
  documents = []
273
+ stats = {'success': 0, 'failed': 0, 'empty': 0}
274
 
275
+ for file_path in json_files:
276
+ try:
277
+ log_message(f" Loading: {file_path}")
278
+ local_path = hf_hub_download(
279
+ repo_id=repo_id,
280
+ filename=file_path,
281
+ repo_type="dataset",
282
+ token=hf_token
283
+ )
284
 
285
+ docs = extract_sections_from_json(local_path)
286
+ if docs:
287
+ documents.extend(docs)
288
+ stats['success'] += 1
289
+ log_message(f" ✓ Extracted {len(docs)} sections")
290
+ else:
291
+ stats['empty'] += 1
292
+ log_message(f" ⚠ No sections found")
293
 
294
+ except Exception as e:
295
+ stats['failed'] += 1
296
+ log_message(f" Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
+ for zip_path in zip_files:
299
+ try:
300
+ log_message(f" Processing ZIP: {zip_path}")
301
+ local_zip = hf_hub_download(
302
+ repo_id=repo_id,
303
+ filename=zip_path,
304
+ repo_type="dataset",
305
+ token=hf_token
306
+ )
307
+
308
+ with zipfile.ZipFile(local_zip, 'r') as zf:
309
+ json_files_in_zip = [f for f in zf.namelist()
310
+ if f.endswith('.json')
311
+ and not f.startswith('__MACOSX')
312
+ and not f.startswith('.')
313
+ and not '._' in f]
314
+
315
+ log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")
316
+
317
+ for json_file in json_files_in_zip:
318
+ try:
319
+ file_content = zf.read(json_file)
320
+
321
+ # Skip if file is too small
322
+ if len(file_content) < 10:
323
+ log_message(f" ✗ Skipping: {json_file} (file too small)")
324
+ stats['failed'] += 1
325
+ continue
326
+
327
+ # Try UTF-8 first (most common)
328
+ try:
329
+ text_content = file_content.decode('utf-8')
330
+ except UnicodeDecodeError:
331
+ try:
332
+ text_content = file_content.decode('utf-8-sig')
333
+ except UnicodeDecodeError:
334
+ try:
335
+ # Try UTF-16 (the issue you're seeing)
336
+ text_content = file_content.decode('utf-16')
337
+ except UnicodeDecodeError:
338
+ try:
339
+ text_content = file_content.decode('windows-1251')
340
+ except UnicodeDecodeError:
341
+ log_message(f" ✗ Skipping: {json_file} (encoding failed)")
342
+ stats['failed'] += 1
343
+ continue
344
+
345
+ # Validate JSON structure
346
+ if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
347
+ log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
348
+ stats['failed'] += 1
349
+ continue
350
+
351
+ with tempfile.NamedTemporaryFile(mode='w', delete=False,
352
+ suffix='.json', encoding='utf-8') as tmp:
353
+ tmp.write(text_content)
354
+ tmp_path = tmp.name
355
+
356
+ docs = extract_sections_from_json(tmp_path)
357
+ if docs:
358
+ documents.extend(docs)
359
+ stats['success'] += 1
360
+ log_message(f" ✓ {json_file}: {len(docs)} sections")
361
+ else:
362
+ stats['empty'] += 1
363
+ log_message(f" ⚠ {json_file}: No sections")
364
+
365
+ os.unlink(tmp_path)
366
+
367
+ except json.JSONDecodeError as e:
368
+ stats['failed'] += 1
369
+ log_message(f" ✗ {json_file}: Invalid JSON")
370
+ except Exception as e:
371
+ stats['failed'] += 1
372
+ log_message(f" ✗ {json_file}: {str(e)[:100]}")
373
+
374
+ except Exception as e:
375
+ log_message(f" ✗ Error with ZIP: {e}")
376
+
377
+ log_message(f"="*60)
378
+ log_message(f"JSON Loading Stats:")
379
+ log_message(f" Success: {stats['success']}")
380
+ log_message(f" Empty: {stats['empty']}")
381
+ log_message(f" Failed: {stats['failed']}")
382
+ log_message(f" Total sections: {len(documents)}")
383
+ log_message(f"="*60)
384
 
385
  return documents
386
 
387
+ def extract_sections_from_json(json_path):
388
+ """Extract sections from a single JSON file"""
389
+ documents = []
390
 
 
391
  try:
392
+ with open(json_path, 'r', encoding='utf-8') as f:
393
+ data = json.load(f)
 
 
394
 
395
+ doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
396
 
397
+ # Extract all section levels
398
+ for section in data.get('sections', []):
399
+ if section.get('section_text', '').strip():
400
+ documents.append(Document(
401
+ text=section['section_text'],
402
+ metadata={
403
+ 'type': 'text',
404
+ 'document_id': doc_id,
405
+ 'section_id': section.get('section_id', '')
406
+ }
407
+ ))
408
+
409
+ # Subsections
410
+ for subsection in section.get('subsections', []):
411
+ if subsection.get('subsection_text', '').strip():
412
+ documents.append(Document(
413
+ text=subsection['subsection_text'],
 
 
 
 
 
 
 
 
 
 
 
414
  metadata={
415
+ 'type': 'text',
416
+ 'document_id': doc_id,
417
+ 'section_id': subsection.get('subsection_id', '')
 
 
 
 
 
418
  }
419
+ ))
420
+
421
+ # Sub-subsections
422
+ for sub_sub in subsection.get('sub_subsections', []):
423
+ if sub_sub.get('sub_subsection_text', '').strip():
424
+ documents.append(Document(
425
+ text=sub_sub['sub_subsection_text'],
426
+ metadata={
427
+ 'type': 'text',
428
+ 'document_id': doc_id,
429
+ 'section_id': sub_sub.get('sub_subsection_id', '')
430
+ }
431
+ ))
432
+
433
  except Exception as e:
434
+ log_message(f"Error extracting from {json_path}: {e}")
435
+
436
+ return documents
437
 
438
 
439
+ def load_table_documents(repo_id, hf_token, table_dir):
440
+ """Load and chunk tables"""
441
+ log_message("Loading tables...")
442
 
443
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
444
+ table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
445
+
446
+ all_chunks = []
447
+ for file_path in table_files:
448
+ try:
449
+ local_path = hf_hub_download(
450
+ repo_id=repo_id,
451
+ filename=file_path,
452
+ repo_type="dataset",
453
+ token=hf_token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  )
455
+
456
+ with open(local_path, 'r', encoding='utf-8') as f:
457
+ data = json.load(f)
458
+
459
+ # Extract file-level document_id
460
+ file_doc_id = data.get('document_id', data.get('document', 'unknown'))
461
+
462
+ for sheet in data.get('sheets', []):
463
+ # Use sheet-level document_id if available, otherwise use file-level
464
+ sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
465
+
466
+ # CRITICAL: Pass document_id to chunk function
467
+ chunks = chunk_table_by_content(sheet, sheet_doc_id)
468
+ all_chunks.extend(chunks)
469
+
470
+ except Exception as e:
471
+ log_message(f"Error loading {file_path}: {e}")
472
+
473
+ log_message(f"✓ Loaded {len(all_chunks)} table chunks")
474
+ return all_chunks
475
+
476
+
477
+ def load_image_documents(repo_id, hf_token, image_dir):
478
+ """Load image descriptions"""
479
+ log_message("Loading images...")
480
+
481
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
482
+ csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
483
+
484
+ documents = []
485
+ for file_path in csv_files:
486
+ try:
487
+ local_path = hf_hub_download(
488
+ repo_id=repo_id,
489
+ filename=file_path,
490
+ repo_type="dataset",
491
+ token=hf_token
492
+ )
493
+
494
+ df = pd.read_csv(local_path)
495
+
496
+ for _, row in df.iterrows():
497
+ content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
498
+ content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
499
+ content += f"Название: {row.get('Название изображения', '')}\n"
500
+ content += f"Описание: {row.get('Описание изображение', '')}\n"
501
+ content += f"Раздел: {row.get('Раздел документа', '')}\n"
502
+
503
+ chunk_size = len(content)
504
+
505
+ documents.append(Document(
506
+ text=content,
507
+ metadata={
508
+ 'type': 'image',
509
+ 'document_id': str(row.get('Обозначение документа', 'unknown')),
510
+ 'image_number': str(row.get('№ Изображения', 'unknown')),
511
+ 'section': str(row.get('Раздел документа', '')),
512
+ 'chunk_size': chunk_size
513
+ }
514
+ ))
515
+ except Exception as e:
516
+ log_message(f"Error loading {file_path}: {e}")
517
+
518
+ if documents:
519
+ avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
520
+ log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
521
+
522
+ return documents
523
+
524
+
525
+ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
526
+ """Main loader - combines all document types"""
527
+ log_message("="*60)
528
+ log_message("STARTING DOCUMENT LOADING")
529
+ log_message("="*60)
530
+
531
+ # Load text sections
532
+ text_docs = load_json_documents(repo_id, hf_token, json_dir)
533
+ text_chunks = chunk_text_documents(text_docs)
534
+
535
+ # Load tables (already chunked)
536
+ table_chunks = load_table_documents(repo_id, hf_token, table_dir)
537
+
538
+ # Load images (no chunking needed)
539
+ image_docs = load_image_documents(repo_id, hf_token, image_dir)
540
+
541
+ all_docs = text_chunks + table_chunks + image_docs
542
+
543
+ log_message("="*60)
544
+ log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
545
+ log_message(f" Text chunks: {len(text_chunks)}")
546
+ log_message(f" Table chunks: {len(table_chunks)}")
547
+ log_message(f" Images: {len(image_docs)}")
548
+ log_message("="*60)
549
+
550
+ return all_docs
table_prep.py CHANGED
@@ -32,8 +32,7 @@ def create_table_content(table_data):
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
- def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
36
- """Simple table chunking: max 5 rows or 2000 chars per chunk"""
37
 
38
  lines = doc.text.strip().split('\n')
39
 
@@ -63,18 +62,17 @@ def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
63
  current_size = len(header)
64
 
65
  for row in data_rows:
66
- row_size = len(row) + 1 # +1 for newline
67
-
68
- # Check if we need to create a new chunk
69
  if (len(current_rows) >= max_rows_per_chunk or
70
  current_size + row_size > max_chunk_size) and current_rows:
71
 
72
  # Save current chunk
73
  chunk_text = header + '\n'.join(current_rows)
74
  chunks.append(chunk_text)
75
-
76
- # Start new chunk (keep last row for overlap)
77
  current_rows = [current_rows[-1]]
 
78
  current_size = len(header) + len(current_rows[0]) + 1
79
 
80
  current_rows.append(row)
@@ -147,7 +145,10 @@ def table_to_document(table_data, document_id=None):
147
  }
148
  )
149
  if len(content) > 2000:
 
 
150
  return chunk_table_document(base_doc)
 
151
 
152
  return [base_doc]
153
 
@@ -205,6 +206,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
205
  stats['total_size'] += size
206
  stats['by_document'][document_id]['count'] += 1
207
  stats['by_document'][document_id]['size'] += size
 
208
  else:
209
  docs_list = table_to_document(table_data, document_id)
210
  table_documents.extend(docs_list)
 
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
+ def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
 
36
 
37
  lines = doc.text.strip().split('\n')
38
 
 
62
  current_size = len(header)
63
 
64
  for row in data_rows:
65
+ row_size = len(row) + 1
 
 
66
  if (len(current_rows) >= max_rows_per_chunk or
67
  current_size + row_size > max_chunk_size) and current_rows:
68
 
69
  # Save current chunk
70
  chunk_text = header + '\n'.join(current_rows)
71
  chunks.append(chunk_text)
72
+ log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
73
+
74
  current_rows = [current_rows[-1]]
75
+ log_message(f"Перенос строки для перекрытия: {current_rows[-1]}")
76
  current_size = len(header) + len(current_rows[0]) + 1
77
 
78
  current_rows.append(row)
 
145
  }
146
  )
147
  if len(content) > 2000:
148
+ chunks = chunk_table_document(base_doc)
149
+ log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
150
  return chunk_table_document(base_doc)
151
+
152
 
153
  return [base_doc]
154
 
 
206
  stats['total_size'] += size
207
  stats['by_document'][document_id]['count'] += 1
208
  stats['by_document'][document_id]['size'] += size
209
+ log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
210
  else:
211
  docs_list = table_to_document(table_data, document_id)
212
  table_documents.extend(docs_list)
utils.py CHANGED
@@ -43,99 +43,6 @@ def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingua
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
46
- def format_context_for_llm(nodes):
47
- context_parts = []
48
-
49
- for node in nodes:
50
- metadata = node.metadata if hasattr(node, 'metadata') else {}
51
- doc_id = metadata.get('document_id', 'Неизвестный документ')
52
-
53
- section_info = ""
54
-
55
- # Handle section information with proper hierarchy
56
- if metadata.get('section_path'):
57
- section_path = metadata['section_path']
58
- section_text = metadata.get('section_text', '')
59
- parent_section = metadata.get('parent_section', '')
60
- parent_title = metadata.get('parent_title', '')
61
- level = metadata.get('level', '')
62
-
63
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
64
- # For subsections: раздел X (Title), пункт X.X
65
- if section_text:
66
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
67
- else:
68
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
69
- elif section_text:
70
- # For main sections: раздел X (Title)
71
- section_info = f"раздел {section_path} ({section_text})"
72
- else:
73
- section_info = f"раздел {section_path}"
74
-
75
- elif metadata.get('section_id'):
76
- section_id = metadata['section_id']
77
- section_text = metadata.get('section_text', '')
78
- level = metadata.get('level', '')
79
- parent_section = metadata.get('parent_section', '')
80
- parent_title = metadata.get('parent_title', '')
81
-
82
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
83
- if section_text:
84
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
85
- else:
86
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
87
- elif section_text:
88
- section_info = f"раздел {section_id} ({section_text})"
89
- else:
90
- section_info = f"раздел {section_id}"
91
-
92
- # Override with table/image info if applicable
93
- if metadata.get('type') == 'table' and metadata.get('table_number'):
94
- table_num = metadata['table_number']
95
- if not str(table_num).startswith('№'):
96
- table_num = f"№{table_num}"
97
- table_title = metadata.get('table_title', '')
98
- # Include section context for tables
99
- base_section = ""
100
- if metadata.get('section_path'):
101
- base_section = f", раздел {metadata['section_path']}"
102
- elif metadata.get('section_id'):
103
- base_section = f", раздел {metadata['section_id']}"
104
-
105
- if table_title:
106
- section_info = f"Таблица {table_num} ({table_title}){base_section}"
107
- else:
108
- section_info = f"Таблица {table_num}{base_section}"
109
-
110
- if metadata.get('type') == 'image' and metadata.get('image_number'):
111
- image_num = metadata['image_number']
112
- if not str(image_num).startswith('№'):
113
- image_num = f"№{image_num}"
114
- image_title = metadata.get('image_title', '')
115
- # Include section context for images
116
- base_section = ""
117
- if metadata.get('section_path'):
118
- base_section = f", раздел {metadata['section_path']}"
119
- elif metadata.get('section_id'):
120
- base_section = f", раздел {metadata['section_id']}"
121
-
122
- if image_title:
123
- section_info = f"Рисунок {image_num} ({image_title}){base_section}"
124
- else:
125
- section_info = f"Рисунок {image_num}{base_section}"
126
-
127
- context_text = node.text if hasattr(node, 'text') else str(node)
128
-
129
- if section_info:
130
- formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
131
- else:
132
- formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
133
-
134
- context_parts.append(formatted_context)
135
-
136
- return "\n".join(context_parts)
137
-
138
-
139
  def generate_sources_html(nodes, chunks_df=None):
140
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
141
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
 
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def generate_sources_html(nodes, chunks_df=None):
47
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
48
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"