MrSimple01 commited on
Commit
7329ea6
·
verified ·
1 Parent(s): 7cc346c

adding new chunks info

Browse files
Files changed (1) hide show
  1. documents_prep.py +478 -410
documents_prep.py CHANGED
@@ -1,411 +1,479 @@
1
- import json
2
- import zipfile
3
- import pandas as pd
4
- from huggingface_hub import hf_hub_download, list_repo_files
5
- from llama_index.core import Document
6
- from my_logging import log_message
7
-
8
-
9
- def extract_text_from_json(data, document_id, document_name):
10
- documents = []
11
-
12
- if 'sections' in data:
13
- for section in data['sections']:
14
- section_id = section.get('section_id', 'Unknown')
15
- section_text = section.get('section_text', '')
16
-
17
- section_path = f"{section_id}"
18
- section_title = extract_section_title(section_text)
19
-
20
- if section_text.strip():
21
- doc = Document(
22
- text=section_text,
23
- metadata={
24
- "type": "text",
25
- "document_id": document_id,
26
- "document_name": document_name,
27
- "section_id": section_id,
28
- "section_text": section_title[:200],
29
- "section_path": section_path,
30
- "level": "section"
31
- }
32
- )
33
- documents.append(doc)
34
-
35
- if 'subsections' in section:
36
- for subsection in section['subsections']:
37
- subsection_id = subsection.get('subsection_id', 'Unknown')
38
- subsection_text = subsection.get('subsection_text', '')
39
- subsection_title = extract_section_title(subsection_text)
40
- subsection_path = f"{section_path}.{subsection_id}"
41
-
42
- if subsection_text.strip():
43
- doc = Document(
44
- text=subsection_text,
45
- metadata={
46
- "type": "text",
47
- "document_id": document_id,
48
- "document_name": document_name,
49
- "section_id": subsection_id,
50
- "section_text": subsection_title[:200],
51
- "section_path": subsection_path,
52
- "level": "subsection",
53
- "parent_section": section_id,
54
- "parent_title": section_title[:100]
55
- }
56
- )
57
- documents.append(doc)
58
-
59
- if 'sub_subsections' in subsection:
60
- for sub_subsection in subsection['sub_subsections']:
61
- sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
62
- sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
63
- sub_subsection_title = extract_section_title(sub_subsection_text)
64
- sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
65
-
66
- if sub_subsection_text.strip():
67
- doc = Document(
68
- text=sub_subsection_text,
69
- metadata={
70
- "type": "text",
71
- "document_id": document_id,
72
- "document_name": document_name,
73
- "section_id": sub_subsection_id,
74
- "section_text": sub_subsection_title[:200],
75
- "section_path": sub_subsection_path,
76
- "level": "sub_subsection",
77
- "parent_section": subsection_id,
78
- "parent_title": subsection_title[:100]
79
- }
80
- )
81
- documents.append(doc)
82
-
83
- if 'sub_sub_subsections' in sub_subsection:
84
- for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
85
- sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
86
- sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
87
- sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
88
-
89
- if sub_sub_subsection_text.strip():
90
- doc = Document(
91
- text=sub_sub_subsection_text,
92
- metadata={
93
- "type": "text",
94
- "document_id": document_id,
95
- "document_name": document_name,
96
- "section_id": sub_sub_subsection_id,
97
- "section_text": sub_sub_subsection_title[:200],
98
- "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
99
- "level": "sub_sub_subsection",
100
- "parent_section": sub_subsection_id,
101
- "parent_title": sub_subsection_title[:100]
102
- }
103
- )
104
- documents.append(doc)
105
-
106
- return documents
107
-
108
- def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
109
- log_message("Начинаю загрузку JSON документов")
110
-
111
- try:
112
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
113
- zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
114
- json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
115
-
116
- log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
117
-
118
- all_documents = []
119
-
120
- for zip_file_path in zip_files:
121
- try:
122
- log_message(f"Загружаю ZIP архив: {zip_file_path}")
123
- local_zip_path = hf_hub_download(
124
- repo_id=repo_id,
125
- filename=zip_file_path,
126
- local_dir=download_dir,
127
- repo_type="dataset",
128
- token=hf_token
129
- )
130
-
131
- documents = extract_zip_and_process_json(local_zip_path)
132
- all_documents.extend(documents)
133
-
134
- except Exception as e:
135
- log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
136
- continue
137
-
138
- for file_path in json_files:
139
- try:
140
- log_message(f"Обрабатываю прямой JSON файл: {file_path}")
141
- local_path = hf_hub_download(
142
- repo_id=repo_id,
143
- filename=file_path,
144
- local_dir=download_dir,
145
- repo_type="dataset",
146
- token=hf_token
147
- )
148
-
149
- with open(local_path, 'r', encoding='utf-8') as f:
150
- json_data = json.load(f)
151
-
152
- document_metadata = json_data.get('document_metadata', {})
153
- document_id = document_metadata.get('document_id', 'unknown')
154
- document_name = document_metadata.get('document_name', 'unknown')
155
-
156
- documents = extract_text_from_json(json_data, document_id, document_name)
157
- all_documents.extend(documents)
158
-
159
- log_message(f"Извлечено {len(documents)} документов из {file_path}")
160
-
161
- except Exception as e:
162
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
163
- continue
164
-
165
- log_message(f"Всего создано {len(all_documents)} текстовых документов")
166
- return all_documents
167
-
168
- except Exception as e:
169
- log_message(f"Ошибка загрузки JSON документов: {str(e)}")
170
- return []
171
-
172
-
173
- def extract_section_title(section_text):
174
- if not section_text.strip():
175
- return ""
176
-
177
- lines = section_text.strip().split('\n')
178
- first_line = lines[0].strip()
179
-
180
- if len(first_line) < 200 and not first_line.endswith('.'):
181
- return first_line
182
-
183
- # Otherwise, extract first sentence
184
- sentences = first_line.split('.')
185
- if len(sentences) > 1:
186
- return sentences[0].strip()
187
-
188
- return first_line[:100] + "..." if len(first_line) > 100 else first_line
189
-
190
- def extract_zip_and_process_json(zip_path):
191
- documents = []
192
-
193
- try:
194
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
195
- zip_files = zip_ref.namelist()
196
- json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
197
-
198
- log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
199
-
200
- for json_file in json_files:
201
- try:
202
- log_message(f"Обрабатываю файл из архива: {json_file}")
203
-
204
- with zip_ref.open(json_file) as f:
205
- json_data = json.load(f)
206
-
207
- document_metadata = json_data.get('document_metadata', {})
208
- document_id = document_metadata.get('document_id', 'unknown')
209
- document_name = document_metadata.get('document_name', 'unknown')
210
-
211
- docs = extract_text_from_json(json_data, document_id, document_name)
212
- documents.extend(docs)
213
-
214
- log_message(f"Извлечено {len(docs)} документов из {json_file}")
215
-
216
- except Exception as e:
217
- log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
218
- continue
219
-
220
- except Exception as e:
221
- log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
222
-
223
- return documents
224
-
225
- def table_to_document(table_data, document_id=None):
226
- content = ""
227
- if isinstance(table_data, dict):
228
- doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
229
-
230
- table_num = table_data.get('table_number', 'Неизвестно')
231
- table_title = table_data.get('table_title', 'Неизвестно')
232
- section = table_data.get('section', 'Неизвестно')
233
-
234
- content += f"Таблица: {table_num}\n"
235
- content += f"Название: {table_title}\n"
236
- content += f"Документ: {doc_id}\n"
237
- content += f"Раздел: {section}\n"
238
-
239
- if 'data' in table_data and isinstance(table_data['data'], list):
240
- for row in table_data['data']:
241
- if isinstance(row, dict):
242
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
243
- content += f"{row_text}\n"
244
-
245
- return Document(
246
- text=content,
247
- metadata={
248
- "type": "table",
249
- "table_number": table_data.get('table_number', 'unknown'),
250
- "table_title": table_data.get('table_title', 'unknown'),
251
- "document_id": doc_id or table_data.get('document_id', table_data.get('document', 'unknown')),
252
- "section": table_data.get('section', 'unknown')
253
- }
254
- )
255
-
256
- def load_table_data(repo_id, hf_token, table_data_dir):
257
- log_message("Начинаю загрузку табличных данных")
258
-
259
- table_files = []
260
- try:
261
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
262
- for file in files:
263
- if file.startswith(table_data_dir) and file.endswith('.json'):
264
- table_files.append(file)
265
-
266
- log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
267
-
268
- table_documents = []
269
- for file_path in table_files:
270
- try:
271
- log_message(f"Обрабатываю файл: {file_path}")
272
- local_path = hf_hub_download(
273
- repo_id=repo_id,
274
- filename=file_path,
275
- local_dir='',
276
- repo_type="dataset",
277
- token=hf_token
278
- )
279
-
280
- with open(local_path, 'r', encoding='utf-8') as f:
281
- table_data = json.load(f)
282
-
283
- if isinstance(table_data, dict):
284
- document_id = table_data.get('document', 'unknown')
285
-
286
- if 'sheets' in table_data:
287
- for sheet in table_data['sheets']:
288
- sheet['document'] = document_id
289
- doc = table_to_document(sheet, document_id)
290
- table_documents.append(doc)
291
- else:
292
- doc = table_to_document(table_data, document_id)
293
- table_documents.append(doc)
294
- elif isinstance(table_data, list):
295
- for table_json in table_data:
296
- doc = table_to_document(table_json)
297
- table_documents.append(doc)
298
-
299
- except Exception as e:
300
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
301
- continue
302
-
303
- log_message(f"Создано {len(table_documents)} документов из таблиц")
304
- return table_documents
305
-
306
- except Exception as e:
307
- log_message(f"Ошибка загрузки табличных данных: {str(e)}")
308
- return []
309
-
310
- def load_image_data(repo_id, hf_token, image_data_dir):
311
- log_message("Начинаю загрузку данных изображений")
312
-
313
- image_files = []
314
- try:
315
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
316
- for file in files:
317
- if file.startswith(image_data_dir) and file.endswith('.csv'):
318
- image_files.append(file)
319
-
320
- log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
321
-
322
- image_documents = []
323
- for file_path in image_files:
324
- try:
325
- log_message(f"Обрабатываю файл изображений: {file_path}")
326
- local_path = hf_hub_download(
327
- repo_id=repo_id,
328
- filename=file_path,
329
- local_dir='',
330
- repo_type="dataset",
331
- token=hf_token
332
- )
333
-
334
- df = pd.read_csv(local_path)
335
- log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
336
-
337
- for _, row in df.iterrows():
338
- content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
339
- content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
340
- content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
341
- content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
342
- content += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"
343
- content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
344
-
345
- doc = Document(
346
- text=content,
347
- metadata={
348
- "type": "image",
349
- "image_number": row.get('№ Изображения', 'unknown'),
350
- "document_id": row.get('Обозначение документа', 'unknown'),
351
- "file_path": row.get('Файл изображения', 'unknown'),
352
- "section": row.get('Раздел документа', 'unknown')
353
- }
354
- )
355
- image_documents.append(doc)
356
-
357
- except Exception as e:
358
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
359
- continue
360
-
361
- log_message(f"Создано {len(image_documents)} документов из изображений")
362
- return image_documents
363
-
364
- except Exception as e:
365
- log_message(f"Ошибка загрузки данных изображений: {str(e)}")
366
- return []
367
-
368
- def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
369
- log_message("Загружаю данные чанков из CSV")
370
-
371
- try:
372
- chunks_csv_path = hf_hub_download(
373
- repo_id=repo_id,
374
- filename=chunks_filename,
375
- local_dir=download_dir,
376
- repo_type="dataset",
377
- token=hf_token
378
- )
379
-
380
- chunks_df = pd.read_csv(chunks_csv_path)
381
- log_message(f"Загружено {len(chunks_df)} чанков из CSV")
382
-
383
- text_column = None
384
- for col in chunks_df.columns:
385
- if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
386
- text_column = col
387
- break
388
-
389
- if text_column is None:
390
- text_column = chunks_df.columns[0]
391
-
392
- log_message(f"Использую колонку: {text_column}")
393
-
394
- documents = []
395
- for i, (_, row) in enumerate(chunks_df.iterrows()):
396
- doc = Document(
397
- text=str(row[text_column]),
398
- metadata={
399
- "chunk_id": row.get('chunk_id', i),
400
- "document_id": row.get('document_id', 'unknown'),
401
- "type": "text"
402
- }
403
- )
404
- documents.append(doc)
405
-
406
- log_message(f"Создано {len(documents)} текстовых документов из CSV")
407
- return documents, chunks_df
408
-
409
- except Exception as e:
410
- log_message(f"Ошибка загрузки CSV данных: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  return [], None
 
1
+ import json
2
+ import zipfile
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download, list_repo_files
5
+ from llama_index.core import Document
6
+ from my_logging import log_message
7
+ from llama_index.core.text_splitter import SentenceSplitter
8
+ from config import CHUNK_SIZE, CHUNK_OVERLAP
9
+
10
+
11
def chunk_document(doc, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Split a single Document into overlapping sentence-based chunks.

    Each chunk inherits a copy of the source document's metadata, extended
    with its position (``chunk_id``), the total chunk count, its character
    length, and the id of the originating document.

    Args:
        doc: Source llama_index ``Document`` to split.
        chunk_size: Target chunk size passed to ``SentenceSplitter``.
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        list: One ``Document`` per produced text chunk.
    """
    text_splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator=" "
    )

    text_chunks = text_splitter.split_text(doc.text)

    chunked_docs = []
    for i, chunk_text in enumerate(text_chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(text_chunks),
            "chunk_size": len(chunk_text),
            # getattr with a default is the idiomatic form of the original
            # `doc.id_ if hasattr(doc, 'id_') else None` dance.
            "original_doc_id": getattr(doc, 'id_', None)
        })

        chunked_docs.append(Document(
            text=chunk_text,
            metadata=chunk_metadata
        ))

    return chunked_docs
37
+
38
+
39
def _chunk_info_entry(doc, chunk_id):
    """Build one summary row (ids, size, 200-char preview) for a chunk."""
    preview = doc.text[:200] + "..." if len(doc.text) > 200 else doc.text
    return {
        'document_id': doc.metadata.get('document_id', 'unknown'),
        'section_id': doc.metadata.get('section_id', 'unknown'),
        'chunk_id': chunk_id,
        'chunk_size': len(doc.text),
        'chunk_preview': preview
    }


def process_documents_with_chunking(documents):
    """Chunk oversized documents and collect per-chunk summary info.

    Documents longer than ``CHUNK_SIZE`` characters are split via
    ``chunk_document``; shorter ones pass through unchanged as a single
    chunk with ``chunk_id`` 0.

    Args:
        documents: Iterable of llama_index ``Document`` objects.

    Returns:
        tuple: (list of chunked Documents, list of chunk-info dicts).
    """
    all_chunked_docs = []
    chunk_info = []

    for doc in documents:
        if len(doc.text) > CHUNK_SIZE:
            chunked_docs = chunk_document(doc)
            all_chunked_docs.extend(chunked_docs)
            # One info row per produced chunk, indexed from 0.
            for i, chunk_doc in enumerate(chunked_docs):
                chunk_info.append(_chunk_info_entry(chunk_doc, i))
        else:
            all_chunked_docs.append(doc)
            chunk_info.append(_chunk_info_entry(doc, 0))

    return all_chunked_docs, chunk_info
67
+
68
# Hierarchy levels: (level name used for key prefixes/metadata, children key).
_SECTION_LEVELS = [
    ("section", "sections"),
    ("subsection", "subsections"),
    ("sub_subsection", "sub_subsections"),
    ("sub_sub_subsection", "sub_sub_subsections"),
]


def _collect_section_level(container, depth, parent_path, parent_id,
                           parent_title, document_id, document_name, out):
    """Recursively turn one hierarchy level into Documents (appended to *out*).

    Mirrors the original 4x-duplicated nested loops: each node's id/text
    keys are ``<level>_id`` / ``<level>_text``, the path is the dotted id
    chain, and nodes below the top level carry parent id/title metadata.
    Empty-text nodes produce no Document but their children are still
    visited.
    """
    level_name, children_key = _SECTION_LEVELS[depth]
    for node in container.get(children_key, []):
        node_id = node.get(f"{level_name}_id", 'Unknown')
        node_text = node.get(f"{level_name}_text", '')
        node_title = extract_section_title(node_text)
        node_path = node_id if depth == 0 else f"{parent_path}.{node_id}"

        if node_text.strip():
            metadata = {
                "type": "text",
                "document_id": document_id,
                "document_name": document_name,
                "section_id": node_id,
                "section_text": node_title[:200],
                "section_path": node_path,
                "level": level_name,
            }
            if depth > 0:
                metadata["parent_section"] = parent_id
                metadata["parent_title"] = parent_title[:100]
            out.append(Document(text=node_text, metadata=metadata))

        if depth + 1 < len(_SECTION_LEVELS):
            _collect_section_level(node, depth + 1, node_path, node_id,
                                   node_title, document_id, document_name, out)


def extract_text_from_json(data, document_id, document_name):
    """Flatten a hierarchical section JSON into a list of text Documents.

    Walks up to four nesting levels (sections -> subsections ->
    sub_subsections -> sub_sub_subsections), emitting one Document per
    node with non-empty text.

    Args:
        data: Parsed JSON dict, expected to contain a 'sections' list.
        document_id: Id recorded in every Document's metadata.
        document_name: Name recorded in every Document's metadata.

    Returns:
        list: Documents in depth-first order, matching the original
        iterative implementation.
    """
    documents = []
    if 'sections' in data:
        _collect_section_level(data, 0, "", None, None,
                               document_id, document_name, documents)
    return documents
166
+
167
def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
    """Download JSON document sources from a HF dataset repo and chunk them.

    Handles both ZIP archives of JSON files and standalone JSON files
    found under *json_files_dir*. Per-file failures are logged and
    skipped; a failure outside the per-file loops aborts the whole load.

    Returns:
        tuple: (chunked Documents, chunk-info rows); ([], []) on fatal error.
    """
    log_message("Начинаю загрузку JSON документов")

    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        in_dir = [name for name in repo_files if name.startswith(json_files_dir)]
        zip_files = [name for name in in_dir if name.endswith('.zip')]
        json_files = [name for name in in_dir if name.endswith('.json')]

        log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")

        all_documents = []

        for zip_file_path in zip_files:
            try:
                log_message(f"Загружаю ZIP архив: {zip_file_path}")
                archive_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=zip_file_path,
                    local_dir=download_dir,
                    repo_type="dataset",
                    token=hf_token,
                )
                all_documents.extend(extract_zip_and_process_json(archive_path))
            except Exception as e:
                log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
                continue

        for file_path in json_files:
            try:
                log_message(f"Обрабатываю прямой JSON файл: {file_path}")
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir=download_dir,
                    repo_type="dataset",
                    token=hf_token,
                )

                with open(local_path, 'r', encoding='utf-8') as fh:
                    payload = json.load(fh)

                meta = payload.get('document_metadata', {})
                extracted = extract_text_from_json(
                    payload,
                    meta.get('document_id', 'unknown'),
                    meta.get('document_name', 'unknown'),
                )
                all_documents.extend(extracted)

                log_message(f"Извлечено {len(extracted)} документов из {file_path}")
            except Exception as e:
                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

        chunked_documents, chunk_info = process_documents_with_chunking(all_documents)

        log_message(f"Всего создано {len(all_documents)} исходных документов")
        log_message(f"После chunking получено {len(chunked_documents)} чанков")

        return chunked_documents, chunk_info

    except Exception as e:
        log_message(f"Ошибка загрузки JSON документов: {str(e)}")
        return [], []
234
+
235
+
236
def extract_section_title(section_text):
    """Derive a short title from the first line of a section's text.

    A first line shorter than 200 chars without a trailing period is used
    verbatim. Otherwise the first sentence is taken; if the line contains
    no period at all it is truncated to 100 chars with an ellipsis.
    """
    stripped = section_text.strip()
    if not stripped:
        return ""

    head = stripped.split('\n', 1)[0].strip()

    # Heading-like line: short and not sentence-terminated.
    if len(head) < 200 and not head.endswith('.'):
        return head

    # Otherwise prefer the first sentence, when the line splits into parts.
    parts = head.split('.')
    if len(parts) > 1:
        return parts[0].strip()

    if len(head) > 100:
        return head[:100] + "..."
    return head
252
+
253
def extract_zip_and_process_json(zip_path):
    """Open a ZIP archive and convert every JSON file inside into Documents.

    macOS resource-fork entries (``__MACOSX``) are ignored. Failures on
    individual entries, or on the archive itself, are logged; the function
    always returns whatever Documents it managed to extract.
    """
    documents = []

    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            entries = [
                name for name in archive.namelist()
                if name.endswith('.json') and not name.startswith('__MACOSX')
            ]

            log_message(f"Найдено {len(entries)} JSON файлов в архиве")

            for entry in entries:
                try:
                    log_message(f"Обрабатываю файл из архива: {entry}")

                    with archive.open(entry) as fh:
                        payload = json.load(fh)

                    meta = payload.get('document_metadata', {})
                    docs = extract_text_from_json(
                        payload,
                        meta.get('document_id', 'unknown'),
                        meta.get('document_name', 'unknown'),
                    )
                    documents.extend(docs)

                    log_message(f"Извлечено {len(docs)} документов из {entry}")

                except Exception as e:
                    log_message(f"Ошибка обработки файла {entry}: {str(e)}")
                    continue

    except Exception as e:
        log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")

    return documents
287
+
288
def table_to_document(table_data, document_id=None):
    """Render one table description as a plain-text Document.

    Args:
        table_data: Dict describing a table ('table_number', 'table_title',
            'section', rows under 'data'). Non-dict input is treated as an
            empty table and yields placeholder values.
        document_id: Explicit document id; takes precedence over ids found
            in *table_data*.

    Returns:
        Document with ``type="table"`` metadata.
    """
    # Bug fix: doc_id used to be bound only inside the isinstance branch,
    # so non-dict input raised NameError at the metadata lookup below.
    # Degenerate input now produces a placeholder Document instead.
    if not isinstance(table_data, dict):
        table_data = {}

    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))

    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    content = f"Таблица: {table_num}\n"
    content += f"Название: {table_title}\n"
    content += f"Документ: {doc_id}\n"
    content += f"Раздел: {section}\n"

    # Flatten row dicts into "key: value | key: value" lines.
    if isinstance(table_data.get('data'), list):
        for row in table_data['data']:
            if isinstance(row, dict):
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
                content += f"{row_text}\n"

    return Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_data.get('table_number', 'unknown'),
            "table_title": table_data.get('table_title', 'unknown'),
            "document_id": doc_id or table_data.get('document_id', table_data.get('document', 'unknown')),
            "section": table_data.get('section', 'unknown'),
            "section_id": table_data.get('section', 'unknown')
        }
    )
319
+
320
def load_table_data(repo_id, hf_token, table_data_dir):
    """Download table JSON files from a HF dataset repo and build Documents.

    Supports three file layouts: a dict with a 'sheets' list, a single
    table dict, or a list of table dicts. Per-file errors are logged and
    skipped.

    Returns:
        list: table Documents; [] on fatal error.
    """
    log_message("Начинаю загрузку табличных данных")

    table_files = []
    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [
            name for name in repo_files
            if name.startswith(table_data_dir) and name.endswith('.json')
        ]

        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")

        table_documents = []
        for file_path in table_files:
            try:
                log_message(f"Обрабатываю файл: {file_path}")
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token,
                )

                with open(local_path, 'r', encoding='utf-8') as fh:
                    table_data = json.load(fh)

                if isinstance(table_data, dict):
                    document_id = table_data.get('document', 'unknown')

                    if 'sheets' in table_data:
                        # Workbook layout: one Document per sheet, tagged
                        # with the workbook's document id.
                        for sheet in table_data['sheets']:
                            sheet['document'] = document_id
                            table_documents.append(table_to_document(sheet, document_id))
                    else:
                        table_documents.append(table_to_document(table_data, document_id))
                elif isinstance(table_data, list):
                    for table_json in table_data:
                        table_documents.append(table_to_document(table_json))

            except Exception as e:
                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

        log_message(f"Создано {len(table_documents)} документов из таблиц")
        return table_documents

    except Exception as e:
        log_message(f"Ошибка загрузки табличных данных: {str(e)}")
        return []
373
+
374
def load_image_data(repo_id, hf_token, image_data_dir):
    """Download image-description CSV files and build one Document per row.

    Column names are the Russian headers used by the dataset; the section
    value falls back to an English 'section' column when the Russian one
    is absent. Per-file errors are logged and skipped.

    Returns:
        list: image Documents; [] on fatal error.
    """
    log_message("Начинаю загрузку данных изображений")

    image_files = []
    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        image_files = [
            name for name in repo_files
            if name.startswith(image_data_dir) and name.endswith('.csv')
        ]

        log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")

        image_documents = []
        for file_path in image_files:
            try:
                log_message(f"Обрабатываю файл изображений: {file_path}")
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token,
                )

                frame = pd.read_csv(local_path)
                log_message(f"Загружено {len(frame)} записей изображений из файла {file_path}")

                for _, record in frame.iterrows():
                    section_value = record.get('Раздел документа', record.get('section', 'Неизвестно'))

                    content = "\n".join([
                        f"Изображение: {record.get('№ Изображения', 'Неизвестно')}",
                        f"Название: {record.get('Название изображения', 'Неизвестно')}",
                        f"Описание: {record.get('Описание изображение', 'Неизвестно')}",
                        f"Документ: {record.get('Обозначение документа', 'Неизвестно')}",
                        f"Раздел: {section_value}",
                        f"Файл: {record.get('Файл изображения', 'Неизвестно')}",
                    ]) + "\n"

                    image_documents.append(Document(
                        text=content,
                        metadata={
                            "type": "image",
                            "image_number": record.get('№ Изображения', 'unknown'),
                            "document_id": record.get('Обозначение документа', 'unknown'),
                            "file_path": record.get('Файл изображения', 'unknown'),
                            "section": section_value,
                            "section_id": section_value
                        }
                    ))

            except Exception as e:
                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

        log_message(f"Создано {len(image_documents)} документов из изображений")
        return image_documents

    except Exception as e:
        log_message(f"Ошибка загрузки данных изображений: {str(e)}")
        return []
434
+
435
+
436
def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
    """Load pre-chunked text data from a CSV file in a HF dataset repo.

    The text column is auto-detected: the first column whose name contains
    'text', 'content' or 'chunk' (case-insensitive), falling back to the
    first column.

    Returns:
        tuple: (list of Documents, raw DataFrame); ([], None) on error.
    """
    log_message("Загружаю данные чанков из CSV")

    try:
        chunks_csv_path = hf_hub_download(
            repo_id=repo_id,
            filename=chunks_filename,
            local_dir=download_dir,
            repo_type="dataset",
            token=hf_token,
        )

        chunks_df = pd.read_csv(chunks_csv_path)
        log_message(f"Загружено {len(chunks_df)} чанков из CSV")

        text_column = next(
            (col for col in chunks_df.columns
             if any(key in col.lower() for key in ('text', 'content', 'chunk'))),
            chunks_df.columns[0],
        )

        log_message(f"Использую колонку: {text_column}")

        documents = []
        for i, (_, record) in enumerate(chunks_df.iterrows()):
            documents.append(Document(
                text=str(record[text_column]),
                metadata={
                    # Row index is the fallback when no chunk_id column exists.
                    "chunk_id": record.get('chunk_id', i),
                    "document_id": record.get('document_id', 'unknown'),
                    "type": "text"
                }
            ))

        log_message(f"Создано {len(documents)} текстовых документов из CSV")
        return documents, chunks_df

    except Exception as e:
        log_message(f"Ошибка загрузки CSV данных: {str(e)}")
        return [], None