MrSimple07 commited on
Commit
f0cb4f3
·
1 Parent(s): aa38fcf

new documents_prep

Browse files
Files changed (3) hide show
  1. config.py +0 -1
  2. documents_prep.py +342 -418
  3. documents_prep_1.py +488 -0
config.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
 
3
  EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
4
- RETRIEVER_TOP_K = 15
5
  SIMILARITY_THRESHOLD = 0.7
6
  RAG_FILES_DIR = "rag_files"
7
  PROCESSED_DATA_FILE = "processed_chunks.csv"
 
1
  import os
2
 
3
  EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
 
4
  SIMILARITY_THRESHOLD = 0.7
5
  RAG_FILES_DIR = "rag_files"
6
  PROCESSED_DATA_FILE = "processed_chunks.csv"
documents_prep.py CHANGED
@@ -3,486 +3,410 @@ import zipfile
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
6
- from my_logging import log_message
7
  from llama_index.core.text_splitter import SentenceSplitter
 
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP
9
- from table_prep import table_to_document, load_table_data
10
-
11
 
12
- def chunk_document(doc, chunk_size=None, chunk_overlap=None):
13
- if chunk_size is None:
14
- chunk_size = CHUNK_SIZE
15
- if chunk_overlap is None:
16
- chunk_overlap = CHUNK_OVERLAP
17
- text_splitter = SentenceSplitter(
18
- chunk_size=chunk_size,
19
- chunk_overlap=chunk_overlap,
20
- separator=" "
21
- )
22
-
23
- text_chunks = text_splitter.split_text(doc.text)
24
-
25
- chunked_docs = []
26
- for i, chunk_text in enumerate(text_chunks):
27
- chunk_metadata = doc.metadata.copy()
28
- chunk_metadata.update({
29
- "chunk_id": i,
30
- "total_chunks": len(text_chunks),
31
- "chunk_size": len(chunk_text),
32
- "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
33
- })
34
-
35
- chunked_doc = Document(
36
- text=chunk_text,
37
- metadata=chunk_metadata
38
- )
39
- chunked_docs.append(chunked_doc)
40
 
41
- return chunked_docs
42
-
43
- def process_documents_with_chunking(documents):
44
- all_chunked_docs = []
45
  chunk_info = []
46
- table_count = 0
47
- table_chunks_count = 0
48
- image_count = 0
49
- image_chunks_count = 0
50
- text_chunks_count = 0
51
 
52
- for doc in documents:
53
- doc_type = doc.metadata.get('type', 'text')
54
- is_already_chunked = doc.metadata.get('is_chunked', False)
55
 
56
- if doc_type == 'table':
57
- if is_already_chunked:
58
- table_chunks_count += 1
59
- all_chunked_docs.append(doc)
60
- chunk_info.append({
61
- 'document_id': doc.metadata.get('document_id', 'unknown'),
62
- 'section_id': doc.metadata.get('section_id', 'unknown'),
63
- 'chunk_id': doc.metadata.get('chunk_id', 0),
64
- 'total_chunks': doc.metadata.get('total_chunks', 1),
65
- 'chunk_size': len(doc.text),
66
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
67
- 'type': 'table',
68
- 'table_number': doc.metadata.get('table_number', 'unknown')
69
- })
70
- else:
71
- table_count += 1
72
- all_chunked_docs.append(doc)
73
- chunk_info.append({
74
- 'document_id': doc.metadata.get('document_id', 'unknown'),
75
- 'section_id': doc.metadata.get('section_id', 'unknown'),
76
- 'chunk_id': 0,
77
- 'chunk_size': len(doc.text),
78
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
79
- 'type': 'table',
80
- 'table_number': doc.metadata.get('table_number', 'unknown')
81
- })
82
 
83
- elif doc_type == 'image':
84
- image_count += 1
85
- doc_size = len(doc.text)
86
- if doc_size > CHUNK_SIZE:
87
- log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
88
- f"Размер: {doc_size} > {CHUNK_SIZE}")
89
- chunked_docs = chunk_document(doc)
90
- image_chunks_count += len(chunked_docs)
91
- all_chunked_docs.extend(chunked_docs)
92
- log_message(f" ✂️ Разделено на {len(chunked_docs)} чанков")
93
-
94
- for i, chunk_doc in enumerate(chunked_docs):
95
- chunk_info.append({
96
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
97
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
98
- 'chunk_id': i,
99
- 'chunk_size': len(chunk_doc.text),
100
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
101
- 'type': 'image',
102
- 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
103
- })
104
- else:
105
- all_chunked_docs.append(doc)
106
- chunk_info.append({
107
- 'document_id': doc.metadata.get('document_id', 'unknown'),
108
- 'section_id': doc.metadata.get('section_id', 'unknown'),
109
- 'chunk_id': 0,
110
- 'chunk_size': doc_size,
111
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
112
- 'type': 'image',
113
- 'image_number': doc.metadata.get('image_number', 'unknown')
114
- })
115
 
116
- else:
117
- doc_size = len(doc.text)
118
- if doc_size > CHUNK_SIZE:
119
- log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
120
- f"Размер: {doc_size} > {CHUNK_SIZE}")
121
- chunked_docs = chunk_document(doc)
122
- text_chunks_count += len(chunked_docs)
123
- all_chunked_docs.extend(chunked_docs)
124
- log_message(f" ✂️ Разделен на {len(chunked_docs)} чанков")
125
 
126
- for i, chunk_doc in enumerate(chunked_docs):
127
- chunk_info.append({
128
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
129
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
130
- 'chunk_id': i,
131
- 'chunk_size': len(chunk_doc.text),
132
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
133
- 'type': 'text'
134
- })
135
- else:
136
- all_chunked_docs.append(doc)
137
- chunk_info.append({
138
- 'document_id': doc.metadata.get('document_id', 'unknown'),
139
- 'section_id': doc.metadata.get('section_id', 'unknown'),
140
- 'chunk_id': 0,
141
- 'chunk_size': doc_size,
142
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
143
- 'type': 'text'
144
- })
 
145
 
146
- log_message(f"\n{'='*60}")
147
- log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
148
- log_message(f" • Таблицы (целые): {table_count}")
149
- log_message(f" • Таблицы (чанки): {table_chunks_count}")
150
- log_message(f" Изображения (целые): {image_count - (image_chunks_count > 0)}")
151
- log_message(f" • Изображения (чанки): {image_chunks_count}")
152
- log_message(f" • Текстовые чанки: {text_chunks_count}")
153
- log_message(f" • Всего документов: {len(all_chunked_docs)}")
154
- log_message(f"{'='*60}\n")
155
-
156
- return all_chunked_docs, chunk_info
157
 
158
- def extract_text_from_json(data, document_id, document_name):
159
- documents = []
 
 
 
 
 
 
160
 
161
- if 'sections' in data:
162
- for section in data['sections']:
163
- section_id = section.get('section_id', 'Unknown')
164
- section_text = section.get('section_text', '')
165
-
166
- section_path = f"{section_id}"
167
- section_title = extract_section_title(section_text)
168
-
169
- if section_text.strip():
170
- doc = Document(
171
- text=section_text,
172
- metadata={
173
- "type": "text",
174
- "document_id": document_id,
175
- "document_name": document_name,
176
- "section_id": section_id,
177
- "section_text": section_title[:200],
178
- "section_path": section_path,
179
- "level": "section"
180
- }
181
- )
182
- documents.append(doc)
183
-
184
- if 'subsections' in section:
185
- for subsection in section['subsections']:
186
- subsection_id = subsection.get('subsection_id', 'Unknown')
187
- subsection_text = subsection.get('subsection_text', '')
188
- subsection_title = extract_section_title(subsection_text)
189
- subsection_path = f"{section_path}.{subsection_id}"
190
-
191
- if subsection_text.strip():
192
- doc = Document(
193
- text=subsection_text,
194
- metadata={
195
- "type": "text",
196
- "document_id": document_id,
197
- "document_name": document_name,
198
- "section_id": subsection_id,
199
- "section_text": subsection_title[:200],
200
- "section_path": subsection_path,
201
- "level": "subsection",
202
- "parent_section": section_id,
203
- "parent_title": section_title[:100]
204
- }
205
- )
206
- documents.append(doc)
207
-
208
- if 'sub_subsections' in subsection:
209
- for sub_subsection in subsection['sub_subsections']:
210
- sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
211
- sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
212
- sub_subsection_title = extract_section_title(sub_subsection_text)
213
- sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
214
-
215
- if sub_subsection_text.strip():
216
- doc = Document(
217
- text=sub_subsection_text,
218
- metadata={
219
- "type": "text",
220
- "document_id": document_id,
221
- "document_name": document_name,
222
- "section_id": sub_subsection_id,
223
- "section_text": sub_subsection_title[:200],
224
- "section_path": sub_subsection_path,
225
- "level": "sub_subsection",
226
- "parent_section": subsection_id,
227
- "parent_title": subsection_title[:100]
228
- }
229
- )
230
- documents.append(doc)
231
-
232
- if 'sub_sub_subsections' in sub_subsection:
233
- for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
234
- sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
235
- sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
236
- sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
237
-
238
- if sub_sub_subsection_text.strip():
239
- doc = Document(
240
- text=sub_sub_subsection_text,
241
- metadata={
242
- "type": "text",
243
- "document_id": document_id,
244
- "document_name": document_name,
245
- "section_id": sub_sub_subsection_id,
246
- "section_text": sub_sub_subsection_title[:200],
247
- "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
248
- "level": "sub_sub_subsection",
249
- "parent_section": sub_subsection_id,
250
- "parent_title": sub_subsection_title[:100]
251
- }
252
- )
253
- documents.append(doc)
254
-
255
- return documents
256
 
257
- def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
258
- log_message("Начинаю загрузку JSON документов")
 
 
259
 
260
  try:
261
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
262
- zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
263
- json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
264
 
265
- log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
266
 
267
- all_documents = []
268
-
269
- for zip_file_path in zip_files:
270
  try:
271
- log_message(f"Загружаю ZIP архив: {zip_file_path}")
272
- local_zip_path = hf_hub_download(
273
  repo_id=repo_id,
274
- filename=zip_file_path,
275
- local_dir=download_dir,
276
- repo_type="dataset",
277
- token=hf_token
278
  )
279
 
280
- documents = extract_zip_and_process_json(local_zip_path)
281
- all_documents.extend(documents)
282
- log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
283
-
284
- except Exception as e:
285
- log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
286
- continue
287
-
288
- for file_path in json_files:
289
- try:
290
- log_message(f"Обрабатываю прямой JSON файл: {file_path}")
291
- local_path = hf_hub_download(
292
- repo_id=repo_id,
293
- filename=file_path,
294
- local_dir=download_dir,
295
- repo_type="dataset",
296
- token=hf_token
297
- )
298
-
299
- with open(local_path, 'r', encoding='utf-8') as f:
300
- json_data = json.load(f)
301
-
302
- document_metadata = json_data.get('document_metadata', {})
303
- document_id = document_metadata.get('document_id', 'unknown')
304
- document_name = document_metadata.get('document_name', 'unknown')
305
-
306
- documents = extract_text_from_json(json_data, document_id, document_name)
307
- all_documents.extend(documents)
308
-
309
- log_message(f"Извлечено {len(documents)} документов из {file_path}")
310
 
 
 
 
 
311
  except Exception as e:
312
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
313
- continue
314
-
315
- log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
316
-
317
- # Process documents through chunking function
318
- chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
319
-
320
- log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
321
-
322
- return chunked_documents, chunk_info
323
 
 
 
 
324
  except Exception as e:
325
- log_message(f"Ошибка загрузки JSON документов: {str(e)}")
326
- return [], []
327
 
328
- def extract_section_title(section_text):
329
- if not section_text.strip():
330
- return ""
 
 
 
 
331
 
332
- lines = section_text.strip().split('\n')
333
- first_line = lines[0].strip()
334
 
335
- if len(first_line) < 200 and not first_line.endswith('.'):
336
- return first_line
337
 
338
- # Otherwise, extract first sentence
339
- sentences = first_line.split('.')
340
- if len(sentences) > 1:
341
- return sentences[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
- return first_line[:100] + "..." if len(first_line) > 100 else first_line
344
 
345
- def extract_zip_and_process_json(zip_path):
346
- documents = []
347
 
348
- try:
349
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
350
- zip_files = zip_ref.namelist()
351
- json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
352
-
353
- log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
354
-
355
- for json_file in json_files:
356
- try:
357
- log_message(f"Обрабатываю файл из архива: {json_file}")
358
-
359
- with zip_ref.open(json_file) as f:
360
- json_data = json.load(f)
361
-
362
- document_metadata = json_data.get('document_metadata', {})
363
- document_id = document_metadata.get('document_id', 'unknown')
364
- document_name = document_metadata.get('document_name', 'unknown')
365
-
366
- docs = extract_text_from_json(json_data, document_id, document_name)
367
- documents.extend(docs)
368
-
369
- log_message(f"Извлечено {len(docs)} документов из {json_file}")
370
-
371
- except Exception as e:
372
- log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
373
- continue
374
 
375
- except Exception as e:
376
- log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
377
 
378
- return documents
 
 
 
 
379
 
380
  def load_image_data(repo_id, hf_token, image_data_dir):
381
- log_message("Начинаю загрузку данных изображений")
 
 
382
 
383
- image_files = []
384
  try:
385
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
386
- for file in files:
387
- if file.startswith(image_data_dir) and file.endswith('.csv'):
388
- image_files.append(file)
389
 
390
- log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
391
 
392
- image_documents = []
393
- for file_path in image_files:
394
  try:
395
- log_message(f"Обрабатываю файл изображений: {file_path}")
396
- local_path = hf_hub_download(
397
  repo_id=repo_id,
398
- filename=file_path,
399
- local_dir='',
400
- repo_type="dataset",
401
- token=hf_token
402
  )
403
 
404
- df = pd.read_csv(local_path)
405
- log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
406
 
407
- # Обработка с правильными названиями колонок
408
- for _, row in df.iterrows():
409
- section_value = row.get('Раздел документа', 'Неизвестно')
410
-
411
- content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
412
- content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
413
- content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n" # Опечатка в названии колонки
414
- content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
415
- content += f"Раздел: {section_value}\n"
416
- content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
417
-
418
- doc = Document(
419
- text=content,
420
- metadata={
421
- "type": "image",
422
- "image_number": str(row.get('№ Изображения', 'unknown')),
423
- "image_title": str(row.get('Название изображения', 'unknown')),
424
- "image_description": str(row.get('Описание изображение', 'unknown')),
425
- "document_id": str(row.get('Обозначение документа', 'unknown')),
426
- "file_path": str(row.get('Файл изображения', 'unknown')),
427
- "section": str(section_value),
428
- "section_id": str(section_value)
429
- }
430
- )
431
- image_documents.append(doc)
432
-
433
  except Exception as e:
434
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
435
- continue
436
-
437
- log_message(f"Создано {len(image_documents)} документов из изображений")
438
- return image_documents
439
 
 
 
 
440
  except Exception as e:
441
- log_message(f"Ошибка загрузки данных изображений: {str(e)}")
442
  return []
443
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
  def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
446
- log_message("Загружаю данные чанков из CSV")
447
 
448
  try:
449
- chunks_csv_path = hf_hub_download(
450
  repo_id=repo_id,
451
  filename=chunks_filename,
452
- local_dir=download_dir,
453
  repo_type="dataset",
454
- token=hf_token
455
  )
456
 
457
- chunks_df = pd.read_csv(chunks_csv_path)
458
- log_message(f"Загружено {len(chunks_df)} чанков из CSV")
459
 
460
- text_column = None
461
- for col in chunks_df.columns:
462
- if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
463
- text_column = col
464
- break
465
-
466
- if text_column is None:
467
- text_column = chunks_df.columns[0]
 
 
 
 
 
468
 
469
- log_message(f"Использую колонку: {text_column}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
471
- documents = []
472
- for i, (_, row) in enumerate(chunks_df.iterrows()):
473
- doc = Document(
474
- text=str(row[text_column]),
475
- metadata={
476
- "chunk_id": row.get('chunk_id', i),
477
- "document_id": row.get('document_id', 'unknown'),
478
- "type": "text"
479
- }
480
- )
481
- documents.append(doc)
482
 
483
- log_message(f"Создано {len(documents)} текстовых документов из CSV")
484
- return documents, chunks_df
 
485
 
486
- except Exception as e:
487
- log_message(f"Ошибка загрузки CSV данных: {str(e)}")
488
- return [], None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
 
6
  from llama_index.core.text_splitter import SentenceSplitter
7
+ from my_logging import log_message
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP
9
+ import os
 
10
 
11
def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
    """Download ZIP archives of section JSONs from a HF dataset repo and build Documents.

    Each JSON file inside each archive is expected to carry a ``sections``
    list; every section is turned into a Document via process_text_section.

    Returns:
        (documents, chunk_info) — parallel lists; ([], []) on top-level failure.
    """
    log_message(f"Загрузка JSON документов из {json_files_dir}")

    documents = []
    chunk_info = []

    try:
        # repo_type must match the hf_hub_download calls below, which target
        # a dataset repo; without it list_repo_files queries the model repo.
        files = list_repo_files(repo_id, repo_type="dataset", token=hf_token)
        zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]

        log_message(f"Найдено {len(zip_files)} ZIP файлов")

        for zip_file in zip_files:
            # Per-archive guard: one corrupt/missing archive must not abort
            # the whole load (previously it bubbled to the outer except and
            # discarded everything already collected).
            try:
                zip_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=zip_file,
                    token=hf_token,
                    repo_type="dataset",
                    local_dir=download_dir
                )

                log_message(f"Обрабатываю архив: {zip_file}")

                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    json_files = [f for f in zip_ref.namelist()
                                  if f.endswith('.json') and not f.startswith('__MACOSX')]

                    log_message(f"Найдено {len(json_files)} JSON файлов в архиве")

                    for json_file in json_files:
                        try:
                            with zip_ref.open(json_file) as f:
                                json_data = json.load(f)

                            doc_id = json_data.get('document_id', os.path.basename(json_file))
                            sections = json_data.get('sections', [])

                            log_message(f"Обработка документа {doc_id}: {len(sections)} разделов")

                            for section in sections:
                                doc, info = process_text_section(section, doc_id)
                                if doc:
                                    documents.append(doc)
                                    chunk_info.append(info)

                        except Exception as e:
                            log_message(f"Ошибка при обработке {json_file}: {str(e)}")

            except Exception as e:
                log_message(f"Ошибка загрузки JSON: {str(e)}")

        log_message(f"Загружено {len(documents)} текстовых документов")
        return documents, chunk_info

    except Exception as e:
        log_message(f"Ошибка загрузки JSON: {str(e)}")
        return [], []
 
 
 
 
 
65
 
66
def process_text_section(section, doc_id):
    """Turn one JSON section dict into a Document plus a chunk-info record.

    The Document metadata and the chunk-info record carry identical content
    but must be independent dicts. Returns (None, None) when the section
    carries no text at all.
    """
    # Collect every field up front; missing keys fall back to benign defaults.
    fields = {
        'document_id': doc_id,
        'section_id': section.get('section_id', 'unknown'),
        'section_path': section.get('section_path', ''),
        'section_text': section.get('section_text', ''),
        'parent_section': section.get('parent_section', ''),
        'parent_title': section.get('parent_title', ''),
        'level': section.get('level', 'section'),
        'type': 'text',
    }

    body = section.get('section_content', '')
    combined = f"{fields['section_text']}\n{body}".strip()

    if not combined:
        return None, None

    fields['chunk_text'] = combined

    document = Document(text=combined, metadata=dict(fields))
    return document, dict(fields)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
def load_table_data(repo_id, hf_token, table_data_dir):
    """Download table JSON files from the HF dataset repo and build Documents.

    Returns a list that may mix single Documents and lists of chunk
    Documents (large tables, see create_table_document); [] on top-level
    failure. Individual bad files are logged and skipped.
    """
    log_message(f"Загрузка табличных данных из {table_data_dir}")

    documents = []

    try:
        # repo_type must match the hf_hub_download calls below, which target
        # a dataset repo; without it list_repo_files queries the model repo.
        files = list_repo_files(repo_id, repo_type="dataset", token=hf_token)
        json_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]

        log_message(f"Найдено {len(json_files)} табличных JSON файлов")

        for json_file in json_files:
            try:
                file_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=json_file,
                    token=hf_token,
                    repo_type="dataset"
                )

                with open(file_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)

                doc = create_table_document(table_data)
                if doc:
                    documents.append(doc)

            except Exception as e:
                log_message(f"Ошибка при обработке таблицы {json_file}: {str(e)}")

        log_message(f"Загружено {len(documents)} табличных документов")
        return documents

    except Exception as e:
        log_message(f"Ошибка загрузки таблиц: {str(e)}")
        return []
147
 
148
def create_table_document(table_data):
    """Build a Document from one table JSON record.

    Small tables (< ~2000 estimated tokens for the whole record) become a
    single Document; larger ones are delegated to
    create_chunked_table_document, which may return a list.
    Returns None for tables with no data rows.
    """
    rows = table_data.get('data', [])
    if not rows:
        return None

    doc_id = table_data.get('document_id', 'unknown')
    number = table_data.get('table_number', 'unknown')
    title = table_data.get('table_title', '')
    section = table_data.get('section', '')
    headers = table_data.get('headers', [])

    # Oversized tables get split row-wise instead of being embedded whole.
    if estimate_tokens(str(table_data)) >= 2000:
        return create_chunked_table_document(
            doc_id, number, title, section, headers, rows
        )

    return Document(
        text=format_table_as_text(number, title, section, headers, rows),
        metadata={
            'document_id': doc_id,
            'table_number': number,
            'table_title': title,
            'section': section,
            'type': 'table',
            'headers': str(headers),
            'row_count': len(rows),
        },
    )
179
+
180
def create_chunked_table_document(doc_id, table_number, table_title, section, headers, data, rows_per_chunk=30):
    """Split a large table into row-window Documents of `rows_per_chunk` rows each.

    Returns a single Document when only one window results, otherwise a
    list of Documents (callers handle both shapes).
    """
    docs = []

    for start in range(0, len(data), rows_per_chunk):
        window = data[start:start + rows_per_chunk]
        end = start + len(window)

        body = format_table_as_text(
            table_number,
            table_title,
            section,
            headers,
            window,
            chunk_info=f"строки {start+1}-{end}"
        )

        docs.append(Document(
            text=body,
            metadata={
                'document_id': doc_id,
                'table_number': table_number,
                'table_title': table_title,
                'section': section,
                'type': 'table',
                'headers': str(headers),
                'chunk_index': start // rows_per_chunk,
                'row_start': start,
                'row_end': end,
                'row_count': len(window),
            },
        ))

    # Historical contract: single-window tables come back as a bare Document.
    return docs[0] if len(docs) == 1 else docs
211
 
212
def format_table_as_text(table_number, table_title, section, headers, data, chunk_info=""):
    """Render a table as plain text for embedding/retrieval.

    Only the first 100 rows of `data` are included — a deliberate cap to
    keep individual chunks bounded. Header and row cells are coerced to
    str so numeric values from JSON don't break the join.
    """
    text_parts = []

    text_parts.append(f"Таблица {table_number}")
    if table_title:
        text_parts.append(f"Название: {table_title}")
    if section:
        text_parts.append(f"Раздел: {section}")
    if chunk_info:
        text_parts.append(f"({chunk_info})")

    # str() each header: JSON tables may carry numeric headers, and
    # str.join raises TypeError on non-string items.
    text_parts.append(f"\nЗаголовки: {', '.join(str(h) for h in headers)}")
    text_parts.append("\nДанные:")

    for row in data[:100]:
        row_text = " | ".join([str(cell) for cell in row])
        text_parts.append(row_text)

    return "\n".join(text_parts)
231
 
232
def load_image_data(repo_id, hf_token, image_data_dir):
    """Download per-image JSON files from the HF dataset repo and build Documents.

    Returns [] on top-level failure; individual bad files are logged and
    skipped.
    """
    log_message(f"Загрузка данных изображений из {image_data_dir}")

    documents = []

    try:
        # repo_type must match the hf_hub_download calls below, which target
        # a dataset repo; without it list_repo_files queries the model repo.
        files = list_repo_files(repo_id, repo_type="dataset", token=hf_token)
        json_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.json')]

        log_message(f"Найдено {len(json_files)} JSON файлов изображений")

        for json_file in json_files:
            try:
                file_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=json_file,
                    token=hf_token,
                    repo_type="dataset"
                )

                with open(file_path, 'r', encoding='utf-8') as f:
                    image_data = json.load(f)

                doc = create_image_document(image_data)
                if doc:
                    documents.append(doc)

            except Exception as e:
                log_message(f"Ошибка при обработке изображения {json_file}: {str(e)}")

        log_message(f"Загружено {len(documents)} документов изображений")
        return documents

    except Exception as e:
        log_message(f"Ошибка загрузки изображений: {str(e)}")
        return []
268
 
269
def create_image_document(image_data):
    """Build a plain-text Document describing a single image record."""
    number = image_data.get('image_number', 'unknown')
    title = image_data.get('image_title', '')
    description = image_data.get('image_description', '')
    section = image_data.get('section', '')

    # Emit only the lines whose source field is non-empty,
    # in the fixed order: title, section, description.
    lines = [f"Рисунок {number}"]
    for label, value in (("Название", title),
                         ("Раздел", section),
                         ("Описание", description)):
        if value:
            lines.append(f"{label}: {value}")

    return Document(
        text="\n".join(lines),
        metadata={
            'document_id': image_data.get('document_id', 'unknown'),
            'image_number': number,
            'image_title': title,
            'section': section,
            'type': 'image',
        },
    )
296
 
297
def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
    """Load pre-chunked text rows from a CSV in the HF dataset repo.

    Rows whose ``chunk_text`` cell is empty/NaN are skipped.

    Returns:
        (documents, dataframe); ([], None) on failure.
    """
    log_message(f"Загрузка CSV чанков из {chunks_filename}")

    try:
        csv_path = hf_hub_download(
            repo_id=repo_id,
            filename=chunks_filename,
            token=hf_token,
            repo_type="dataset",
            local_dir=download_dir
        )

        df = pd.read_csv(csv_path)
        log_message(f"Загружено {len(df)} строк из CSV")

        documents = []
        for _, row in df.iterrows():
            text = row.get('chunk_text', '')
            # pandas yields NaN (a *truthy* float) for empty cells, so a bare
            # `if text:` would create a Document with non-string text.
            if not isinstance(text, str) or not text:
                continue

            doc = Document(
                text=text,
                metadata={
                    'document_id': row.get('document_id', 'unknown'),
                    'section_id': row.get('section_id', 'unknown'),
                    'section_path': row.get('section_path', ''),
                    'type': 'text'
                }
            )
            documents.append(doc)

        log_message(f"Создано {len(documents)} документов из CSV")
        return documents, df

    except Exception as e:
        log_message(f"Ошибка загрузки CSV: {str(e)}")
        return [], None
332
+
333
def process_documents_with_chunking(documents):
    """Split oversized text documents into chunks; pass tables/images through whole.

    `documents` may contain plain Documents or — for large tables produced
    by create_chunked_table_document — *lists* of chunk Documents.

    Returns:
        (chunked_documents, chunk_info) — flat document list plus one
        summary dict per emitted document (see create_chunk_info).
    """
    log_message(f"Чанкинг {len(documents)} документов")

    text_splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separator=" ",
        backup_separators=["\n", ".", "!", "?"]
    )

    chunked_documents = []
    chunk_info = []

    for doc in documents:
        # BUG FIX: a pre-chunked table arrives as a list of Documents; it
        # must be handled before touching .metadata (lists have none —
        # previously this raised AttributeError).
        if isinstance(doc, list):
            chunked_documents.extend(doc)
            for d in doc:
                chunk_info.append(create_chunk_info(d))
            continue

        doc_type = doc.metadata.get('type', 'text')

        if doc_type in ('table', 'image'):
            # Tables and images are indexed as-is, never re-split.
            chunked_documents.append(doc)
            chunk_info.append(create_chunk_info(doc))
        else:
            token_count = estimate_tokens(doc.text)

            if token_count <= CHUNK_SIZE:
                chunked_documents.append(doc)
                chunk_info.append(create_chunk_info(doc))
            else:
                nodes = text_splitter.get_nodes_from_documents([doc])

                for node in nodes:
                    new_doc = Document(
                        text=node.text,
                        metadata=doc.metadata
                    )
                    chunked_documents.append(new_doc)
                    chunk_info.append(create_chunk_info(new_doc))

    log_message(f"Получено {len(chunked_documents)} чанков после обработки")
    return chunked_documents, chunk_info
381
+
382
def create_chunk_info(doc):
    """Summarize a Document into a flat dict for the chunk-info report.

    The common fields are document_id, type and a 500-char text preview;
    type-specific fields are merged in on top.
    """
    meta = doc.metadata
    doc_type = meta.get('type', 'text')

    info = {
        'document_id': meta.get('document_id', 'unknown'),
        'type': doc_type,
        'chunk_text': doc.text[:500],  # preview only, not the full text
    }

    if doc_type == 'table':
        extra = {
            'table_number': meta.get('table_number', 'unknown'),
            'table_title': meta.get('table_title', ''),
            'section': meta.get('section', ''),
        }
    elif doc_type == 'image':
        extra = {
            'image_number': meta.get('image_number', 'unknown'),
            'image_title': meta.get('image_title', ''),
            'section': meta.get('section', ''),
        }
    else:
        extra = {
            'section_id': meta.get('section_id', 'unknown'),
            'section_path': meta.get('section_path', ''),
            'section_text': meta.get('section_text', ''),
            'parent_section': meta.get('parent_section', ''),
            'parent_title': meta.get('parent_title', ''),
            'level': meta.get('level', 'section'),
        }

    info.update(extra)
    return info
410
+
411
def estimate_tokens(text):
    """Rough token-count estimate: whitespace-split word count scaled by 1.3."""
    word_count = len(text.split())
    return word_count * 1.3
documents_prep_1.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import zipfile
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download, list_repo_files
5
+ from llama_index.core import Document
6
+ from my_logging import log_message
7
+ from llama_index.core.text_splitter import SentenceSplitter
8
+ from config import CHUNK_SIZE, CHUNK_OVERLAP
9
+ from table_prep import table_to_document, load_table_data
10
+
11
+
12
def chunk_document(doc, chunk_size=None, chunk_overlap=None):
    """Split one Document into overlapping sentence-based chunks.

    Falls back to the configured CHUNK_SIZE / CHUNK_OVERLAP when the
    corresponding argument is None. Each resulting chunk carries a copy of
    the source metadata plus chunk bookkeeping fields (chunk_id,
    total_chunks, chunk_size, original_doc_id).
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP

    splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator=" "
    )

    pieces = splitter.split_text(doc.text)
    total = len(pieces)
    # Preserve a back-reference to the source document when it has one.
    source_id = getattr(doc, 'id_', None)

    result = []
    for index, piece in enumerate(pieces):
        meta = doc.metadata.copy()
        meta.update({
            "chunk_id": index,
            "total_chunks": total,
            "chunk_size": len(piece),
            "original_doc_id": source_id
        })
        result.append(Document(text=piece, metadata=meta))

    return result
42
+
43
def process_documents_with_chunking(documents):
    """Split oversized image/text documents into chunks; pass tables through.

    Args:
        documents: Documents whose metadata 'type' is 'table', 'image' or
            anything else (treated as text). Tables are never re-split here;
            images and text are split via chunk_document() when their raw
            length exceeds CHUNK_SIZE.

    Returns:
        (all_chunked_docs, chunk_info): the (possibly chunked) documents and
        one summary dict per emitted document.
    """
    all_chunked_docs = []
    chunk_info = []
    table_count = 0          # tables passed through whole
    table_chunks_count = 0   # tables that arrived pre-chunked upstream
    image_count = 0          # all image documents seen
    image_whole_count = 0    # images small enough to keep whole
    image_chunks_count = 0   # chunks produced from oversized images
    text_chunks_count = 0    # chunks produced from oversized text

    def _preview(text):
        # Uniform 200-character preview used by every chunk_info entry.
        return text[:200] + "..." if len(text) > 200 else text

    for doc in documents:
        doc_type = doc.metadata.get('type', 'text')
        is_already_chunked = doc.metadata.get('is_chunked', False)

        if doc_type == 'table':
            if is_already_chunked:
                table_chunks_count += 1
                all_chunked_docs.append(doc)
                chunk_info.append({
                    'document_id': doc.metadata.get('document_id', 'unknown'),
                    'section_id': doc.metadata.get('section_id', 'unknown'),
                    'chunk_id': doc.metadata.get('chunk_id', 0),
                    'total_chunks': doc.metadata.get('total_chunks', 1),
                    'chunk_size': len(doc.text),
                    'chunk_preview': _preview(doc.text),
                    'type': 'table',
                    'table_number': doc.metadata.get('table_number', 'unknown')
                })
            else:
                table_count += 1
                all_chunked_docs.append(doc)
                chunk_info.append({
                    'document_id': doc.metadata.get('document_id', 'unknown'),
                    'section_id': doc.metadata.get('section_id', 'unknown'),
                    'chunk_id': 0,
                    'chunk_size': len(doc.text),
                    'chunk_preview': _preview(doc.text),
                    'type': 'table',
                    'table_number': doc.metadata.get('table_number', 'unknown')
                })

        elif doc_type == 'image':
            image_count += 1
            doc_size = len(doc.text)
            if doc_size > CHUNK_SIZE:
                log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
                            f"Размер: {doc_size} > {CHUNK_SIZE}")
                chunked_docs = chunk_document(doc)
                image_chunks_count += len(chunked_docs)
                all_chunked_docs.extend(chunked_docs)
                log_message(f" ✂️ Разделено на {len(chunked_docs)} чанков")

                for i, chunk_doc in enumerate(chunked_docs):
                    chunk_info.append({
                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                        'chunk_id': i,
                        'chunk_size': len(chunk_doc.text),
                        'chunk_preview': _preview(chunk_doc.text),
                        'type': 'image',
                        'image_number': chunk_doc.metadata.get('image_number', 'unknown')
                    })
            else:
                image_whole_count += 1
                all_chunked_docs.append(doc)
                chunk_info.append({
                    'document_id': doc.metadata.get('document_id', 'unknown'),
                    'section_id': doc.metadata.get('section_id', 'unknown'),
                    'chunk_id': 0,
                    'chunk_size': doc_size,
                    'chunk_preview': _preview(doc.text),
                    'type': 'image',
                    'image_number': doc.metadata.get('image_number', 'unknown')
                })

        else:
            doc_size = len(doc.text)
            if doc_size > CHUNK_SIZE:
                log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
                            f"Размер: {doc_size} > {CHUNK_SIZE}")
                chunked_docs = chunk_document(doc)
                text_chunks_count += len(chunked_docs)
                all_chunked_docs.extend(chunked_docs)
                log_message(f" ✂️ Разделен на {len(chunked_docs)} чанков")

                for i, chunk_doc in enumerate(chunked_docs):
                    chunk_info.append({
                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                        'chunk_id': i,
                        'chunk_size': len(chunk_doc.text),
                        'chunk_preview': _preview(chunk_doc.text),
                        'type': 'text'
                    })
            else:
                all_chunked_docs.append(doc)
                chunk_info.append({
                    'document_id': doc.metadata.get('document_id', 'unknown'),
                    'section_id': doc.metadata.get('section_id', 'unknown'),
                    'chunk_id': 0,
                    'chunk_size': doc_size,
                    'chunk_preview': _preview(doc.text),
                    'type': 'text'
                })

    log_message(f"\n{'='*60}")
    log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
    log_message(f" • Таблицы (целые): {table_count}")
    log_message(f" • Таблицы (чанки): {table_chunks_count}")
    # BUG FIX: the previous version logged `image_count - (image_chunks_count > 0)`,
    # subtracting a boolean (0 or 1) instead of the number of images actually
    # split, so the "whole images" figure was wrong whenever >1 image was split.
    log_message(f" • Изображения (целые): {image_whole_count}")
    log_message(f" • Изображения (чанки): {image_chunks_count}")
    log_message(f" • Текстовые чанки: {text_chunks_count}")
    log_message(f" • Всего документов: {len(all_chunked_docs)}")
    log_message(f"{'='*60}\n")

    return all_chunked_docs, chunk_info
157
+
158
# Level names and child-container keys for the four nesting depths of the
# source JSON schema (sections -> subsections -> sub_subsections -> ...).
_SECTION_LEVELS = ("section", "subsection", "sub_subsection", "sub_sub_subsection")
_SECTION_CHILD_KEYS = ("subsections", "sub_subsections", "sub_sub_subsections")


def _walk_section(node, depth, path_prefix, parent_id, parent_title,
                  document_id, document_name, out):
    """Recursively convert one section node (and its children) to Documents.

    Mirrors the original hand-unrolled loops exactly: a node with empty text
    produces no Document, but its children are still traversed; metadata key
    names are identical at every depth; sub-levels additionally record their
    parent's id and (100-char-truncated) title.
    """
    level = _SECTION_LEVELS[depth]
    node_id = node.get(f"{level}_id", 'Unknown')
    node_text = node.get(f"{level}_text", '')
    title = extract_section_title(node_text)
    # Top-level path is the (string-coerced) id; deeper paths are dotted.
    path = f"{node_id}" if depth == 0 else f"{path_prefix}.{node_id}"

    if node_text.strip():
        metadata = {
            "type": "text",
            "document_id": document_id,
            "document_name": document_name,
            "section_id": node_id,
            "section_text": title[:200],
            "section_path": path,
            "level": level,
        }
        if depth > 0:
            metadata["parent_section"] = parent_id
            metadata["parent_title"] = parent_title[:100]
        out.append(Document(text=node_text, metadata=metadata))

    if depth < len(_SECTION_CHILD_KEYS):
        for child in node.get(_SECTION_CHILD_KEYS[depth], []):
            _walk_section(child, depth + 1, path, node_id, title,
                          document_id, document_name, out)


def extract_text_from_json(data, document_id, document_name):
    """Flatten a structured JSON document into a list of text Documents.

    Walks up to four nesting levels (sections / subsections /
    sub_subsections / sub_sub_subsections), emitting one Document per
    non-empty node with hierarchy metadata (section_path, level, parent...).
    """
    documents = []
    for section in data.get('sections', []):
        _walk_section(section, 0, None, None, None,
                      document_id, document_name, documents)
    return documents
256
+
257
def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
    """Load structured JSON documents from a Hugging Face dataset repo.

    Downloads both ZIP archives of JSON files and standalone JSON files found
    under `json_files_dir`, converts them to Documents, then chunks them via
    process_documents_with_chunking.

    Returns:
        (chunked_documents, chunk_info), or ([], []) on any top-level failure.
    """
    log_message("Начинаю загрузку JSON документов")

    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
        json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]

        log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")

        all_documents = []

        # First pass: ZIP archives (each may contain many JSON files).
        for zip_file_path in zip_files:
            try:
                log_message(f"Загружаю ZIP архив: {zip_file_path}")
                local_zip_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=zip_file_path,
                    local_dir=download_dir,
                    repo_type="dataset",
                    token=hf_token
                )

                documents = extract_zip_and_process_json(local_zip_path)
                all_documents.extend(documents)
                log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")

            except Exception as e:
                # Best-effort: one bad archive must not abort the whole load.
                log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
                continue

        # Second pass: standalone JSON files.
        for file_path in json_files:
            try:
                log_message(f"Обрабатываю прямой JSON файл: {file_path}")
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir=download_dir,
                    repo_type="dataset",
                    token=hf_token
                )

                with open(local_path, 'r', encoding='utf-8') as f:
                    json_data = json.load(f)

                # Document identity comes from the file's own metadata block.
                document_metadata = json_data.get('document_metadata', {})
                document_id = document_metadata.get('document_id', 'unknown')
                document_name = document_metadata.get('document_name', 'unknown')

                documents = extract_text_from_json(json_data, document_id, document_name)
                all_documents.extend(documents)

                log_message(f"Извлечено {len(documents)} документов из {file_path}")

            except Exception as e:
                # Best-effort: skip unreadable files and keep going.
                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

        log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")

        # Process documents through chunking function
        chunked_documents, chunk_info = process_documents_with_chunking(all_documents)

        log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")

        return chunked_documents, chunk_info

    except Exception as e:
        log_message(f"Ошибка загрузки JSON документов: {str(e)}")
        return [], []
327
+
328
def extract_section_title(section_text):
    """Derive a short title from a section's raw text.

    Returns the first line verbatim when it looks like a heading (short and
    not ending with a period); otherwise falls back to the first sentence of
    that line, then to a 100-character truncation.
    """
    stripped = section_text.strip()
    if not stripped:
        return ""

    first_line = stripped.split('\n')[0].strip()

    looks_like_heading = len(first_line) < 200 and not first_line.endswith('.')
    if looks_like_heading:
        return first_line

    # Fall back to the first sentence when the line reads like prose.
    parts = first_line.split('.')
    if len(parts) > 1:
        return parts[0].strip()

    if len(first_line) > 100:
        return first_line[:100] + "..."
    return first_line
344
+
345
def extract_zip_and_process_json(zip_path):
    """Read every JSON file inside a ZIP archive and convert it to Documents.

    macOS resource-fork entries (__MACOSX) are skipped; per-file failures are
    logged and do not abort processing of the remaining files.
    """
    documents = []

    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            json_names = [
                name for name in archive.namelist()
                if name.endswith('.json') and not name.startswith('__MACOSX')
            ]

            log_message(f"Найдено {len(json_names)} JSON файлов в архиве")

            for name in json_names:
                try:
                    log_message(f"Обрабатываю файл из архива: {name}")

                    with archive.open(name) as handle:
                        payload = json.load(handle)

                    meta = payload.get('document_metadata', {})
                    doc_id = meta.get('document_id', 'unknown')
                    doc_name = meta.get('document_name', 'unknown')

                    extracted = extract_text_from_json(payload, doc_id, doc_name)
                    documents.extend(extracted)

                    log_message(f"Извлечено {len(extracted)} документов из {name}")

                except Exception as e:
                    log_message(f"Ошибка обработки файла {name}: {str(e)}")
                    continue

    except Exception as e:
        log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")

    return documents
379
+
380
def load_image_data(repo_id, hf_token, image_data_dir):
    """Load image-description CSVs from a HF dataset repo as image Documents.

    Each CSV row (with Russian column headers) becomes one Document whose
    text is a human-readable summary and whose metadata mirrors the columns.
    Returns a list of Documents, or [] on any top-level failure.
    """
    log_message("Начинаю загрузку данных изображений")

    image_files = []
    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        for file in files:
            if file.startswith(image_data_dir) and file.endswith('.csv'):
                image_files.append(file)

        log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")

        image_documents = []
        for file_path in image_files:
            try:
                log_message(f"Обрабатываю файл изображений: {file_path}")
                # NOTE(review): local_dir='' downloads into the current working
                # directory (other loaders use an explicit download_dir) — confirm intended.
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token
                )

                df = pd.read_csv(local_path)
                log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")

                # Iterate rows using the CSV's actual (Russian) column headers.
                for _, row in df.iterrows():
                    section_value = row.get('Раздел документа', 'Неизвестно')

                    content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
                    content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"  # typo in the source column name — kept as-is to match the CSV
                    content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
                    content += f"Раздел: {section_value}\n"
                    content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"

                    doc = Document(
                        text=content,
                        metadata={
                            "type": "image",
                            "image_number": str(row.get('№ Изображения', 'unknown')),
                            "image_title": str(row.get('Название изображения', 'unknown')),
                            "image_description": str(row.get('Описание изображение', 'unknown')),
                            "document_id": str(row.get('Обозначение документа', 'unknown')),
                            "file_path": str(row.get('Файл изображения', 'unknown')),
                            # The document section doubles as section/section_id
                            # so image chunks align with text-chunk metadata.
                            "section": str(section_value),
                            "section_id": str(section_value)
                        }
                    )
                    image_documents.append(doc)

            except Exception as e:
                # Best-effort: a bad CSV is logged and skipped.
                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

        log_message(f"Создано {len(image_documents)} документов из изображений")
        return image_documents

    except Exception as e:
        log_message(f"Ошибка загрузки данных изображений: {str(e)}")
        return []
443
+
444
+
445
def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
    """Load pre-chunked text from a CSV in a HF dataset repo.

    Heuristically picks the text column (first column whose name contains
    'text', 'content' or 'chunk'; otherwise the first column) and wraps each
    row as a text Document.

    Returns:
        (documents, chunks_df), or ([], None) on failure.
    """
    log_message("Загружаю данные чанков из CSV")

    try:
        chunks_csv_path = hf_hub_download(
            repo_id=repo_id,
            filename=chunks_filename,
            local_dir=download_dir,
            repo_type="dataset",
            token=hf_token
        )

        chunks_df = pd.read_csv(chunks_csv_path)
        log_message(f"Загружено {len(chunks_df)} чанков из CSV")

        # Heuristic text-column detection; falls back to the first column.
        text_column = None
        for col in chunks_df.columns:
            if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
                text_column = col
                break

        if text_column is None:
            text_column = chunks_df.columns[0]

        log_message(f"Использую колонку: {text_column}")

        documents = []
        for i, (_, row) in enumerate(chunks_df.iterrows()):
            doc = Document(
                text=str(row[text_column]),
                metadata={
                    # Row position serves as chunk_id when the CSV lacks one.
                    "chunk_id": row.get('chunk_id', i),
                    "document_id": row.get('document_id', 'unknown'),
                    "type": "text"
                }
            )
            documents.append(doc)

        log_message(f"Создано {len(documents)} текстовых документов из CSV")
        return documents, chunks_df

    except Exception as e:
        log_message(f"Ошибка загрузки CSV данных: {str(e)}")
        return [], None