MrSimple07 committed on
Commit
5fc122f
·
1 Parent(s): f0cb4f3

new documents_prep

Browse files
Files changed (1) hide show
  1. documents_prep.py +173 -287
documents_prep.py CHANGED
@@ -1,12 +1,50 @@
1
  import json
2
  import zipfile
 
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
6
  from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP
9
- import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
12
  log_message(f"Загрузка JSON документов из {json_files_dir}")
@@ -15,27 +53,27 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
15
  chunk_info = []
16
 
17
  try:
18
- files = list_repo_files(repo_id, token=hf_token)
19
  zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
20
 
21
- log_message(f"Найдено {len(zip_files)} ZIP файлов")
22
 
23
  for zip_file in zip_files:
 
 
24
  zip_path = hf_hub_download(
25
  repo_id=repo_id,
26
  filename=zip_file,
27
- token=hf_token,
28
  repo_type="dataset",
29
- local_dir=download_dir
30
  )
31
 
32
- log_message(f"Обрабатываю архив: {zip_file}")
33
-
34
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
35
  json_files = [f for f in zip_ref.namelist()
36
  if f.endswith('.json') and not f.startswith('__MACOSX')]
37
 
38
- log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
39
 
40
  for json_file in json_files:
41
  try:
@@ -45,68 +83,60 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
45
  doc_id = json_data.get('document_id', os.path.basename(json_file))
46
  sections = json_data.get('sections', [])
47
 
48
- log_message(f"Обработка документа {doc_id}: {len(sections)} разделов")
49
-
50
  for section in sections:
51
- doc, info = process_text_section(section, doc_id)
52
- if doc:
53
- documents.append(doc)
54
- chunk_info.append(info)
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  except Exception as e:
57
- log_message(f"Ошибка при обработке {json_file}: {str(e)}")
 
 
 
 
 
 
 
 
58
 
59
- log_message(f"Загружено {len(documents)} текстовых документов")
60
- return documents, chunk_info
61
-
62
  except Exception as e:
63
  log_message(f"Ошибка загрузки JSON: {str(e)}")
64
  return [], []
65
 
66
- def process_text_section(section, doc_id):
67
- section_id = section.get('section_id', 'unknown')
68
- section_path = section.get('section_path', '')
69
- section_text = section.get('section_text', '')
70
- section_content = section.get('section_content', '')
71
- parent_section = section.get('parent_section', '')
72
- parent_title = section.get('parent_title', '')
73
- level = section.get('level', 'section')
74
-
75
- full_text = f"{section_text}\n{section_content}".strip()
76
 
77
- if not full_text:
78
- return None, None
 
79
 
80
- metadata = {
81
- 'document_id': doc_id,
82
- 'section_id': section_id,
83
- 'section_path': section_path,
84
- 'section_text': section_text,
85
- 'parent_section': parent_section,
86
- 'parent_title': parent_title,
87
- 'level': level,
88
- 'type': 'text',
89
- 'chunk_text': full_text
90
- }
91
 
92
- doc = Document(
93
- text=full_text,
94
- metadata=metadata
95
- )
96
 
97
- chunk_info = {
98
- 'document_id': doc_id,
99
- 'section_id': section_id,
100
- 'section_path': section_path,
101
- 'section_text': section_text,
102
- 'parent_section': parent_section,
103
- 'parent_title': parent_title,
104
- 'level': level,
105
- 'type': 'text',
106
- 'chunk_text': full_text
107
- }
108
 
109
- return doc, chunk_info
110
 
111
  def load_table_data(repo_id, hf_token, table_data_dir):
112
  log_message(f"Загрузка табличных данных из {table_data_dir}")
@@ -114,299 +144,155 @@ def load_table_data(repo_id, hf_token, table_data_dir):
114
  documents = []
115
 
116
  try:
117
- files = list_repo_files(repo_id, token=hf_token)
118
- json_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
119
 
120
- log_message(f"Найдено {len(json_files)} табличных JSON файлов")
121
 
122
- for json_file in json_files:
123
  try:
124
  file_path = hf_hub_download(
125
  repo_id=repo_id,
126
- filename=json_file,
127
- token=hf_token,
128
- repo_type="dataset"
129
  )
130
 
131
  with open(file_path, 'r', encoding='utf-8') as f:
132
  table_data = json.load(f)
133
 
134
- doc = create_table_document(table_data)
135
- if doc:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  documents.append(doc)
137
 
138
  except Exception as e:
139
- log_message(f"Ошибка при обработке таблицы {json_file}: {str(e)}")
140
 
141
  log_message(f"Загружено {len(documents)} табличных документов")
142
  return documents
143
-
144
  except Exception as e:
145
  log_message(f"Ошибка загрузки таблиц: {str(e)}")
146
  return []
147
 
148
- def create_table_document(table_data):
149
- doc_id = table_data.get('document_id', 'unknown')
150
- table_number = table_data.get('table_number', 'unknown')
151
- table_title = table_data.get('table_title', '')
152
- section = table_data.get('section', '')
153
- headers = table_data.get('headers', [])
154
- data = table_data.get('data', [])
155
-
156
- if not data:
157
- return None
158
-
159
- token_count = estimate_tokens(str(table_data))
160
-
161
- if token_count < 2000:
162
- text = format_table_as_text(table_number, table_title, section, headers, data)
163
-
164
- metadata = {
165
- 'document_id': doc_id,
166
- 'table_number': table_number,
167
- 'table_title': table_title,
168
- 'section': section,
169
- 'type': 'table',
170
- 'headers': str(headers),
171
- 'row_count': len(data)
172
- }
173
-
174
- return Document(text=text, metadata=metadata)
175
- else:
176
- return create_chunked_table_document(
177
- doc_id, table_number, table_title, section, headers, data
178
- )
179
-
180
- def create_chunked_table_document(doc_id, table_number, table_title, section, headers, data, rows_per_chunk=30):
181
- chunks = []
182
-
183
- for i in range(0, len(data), rows_per_chunk):
184
- chunk_rows = data[i:i+rows_per_chunk]
185
-
186
- text = format_table_as_text(
187
- table_number,
188
- table_title,
189
- section,
190
- headers,
191
- chunk_rows,
192
- chunk_info=f"строки {i+1}-{i+len(chunk_rows)}"
193
- )
194
-
195
- metadata = {
196
- 'document_id': doc_id,
197
- 'table_number': table_number,
198
- 'table_title': table_title,
199
- 'section': section,
200
- 'type': 'table',
201
- 'headers': str(headers),
202
- 'chunk_index': i // rows_per_chunk,
203
- 'row_start': i,
204
- 'row_end': i + len(chunk_rows),
205
- 'row_count': len(chunk_rows)
206
- }
207
-
208
- chunks.append(Document(text=text, metadata=metadata))
209
-
210
- return chunks[0] if len(chunks) == 1 else chunks
211
-
212
- def format_table_as_text(table_number, table_title, section, headers, data, chunk_info=""):
213
- text_parts = []
214
-
215
- text_parts.append(f"Таблица {table_number}")
216
- if table_title:
217
- text_parts.append(f"Название: {table_title}")
218
- if section:
219
- text_parts.append(f"Раздел: {section}")
220
- if chunk_info:
221
- text_parts.append(f"({chunk_info})")
222
-
223
- text_parts.append(f"\nЗаголовки: {', '.join(headers)}")
224
- text_parts.append("\nДанные:")
225
-
226
- for row in data[:100]:
227
- row_text = " | ".join([str(cell) for cell in row])
228
- text_parts.append(row_text)
229
-
230
- return "\n".join(text_parts)
231
-
232
  def load_image_data(repo_id, hf_token, image_data_dir):
233
  log_message(f"Загрузка данных изображений из {image_data_dir}")
234
 
235
  documents = []
236
 
237
  try:
238
- files = list_repo_files(repo_id, token=hf_token)
239
- json_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.json')]
240
 
241
- log_message(f"Найдено {len(json_files)} JSON файлов изображений")
242
 
243
- for json_file in json_files:
244
  try:
245
  file_path = hf_hub_download(
246
  repo_id=repo_id,
247
- filename=json_file,
248
- token=hf_token,
249
- repo_type="dataset"
250
  )
251
 
252
  with open(file_path, 'r', encoding='utf-8') as f:
253
  image_data = json.load(f)
254
 
255
- doc = create_image_document(image_data)
256
- if doc:
257
- documents.append(doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
  except Exception as e:
260
- log_message(f"Ошибка при обработке изображения {json_file}: {str(e)}")
261
 
262
  log_message(f"Загружено {len(documents)} документов изображений")
263
  return documents
264
-
265
  except Exception as e:
266
  log_message(f"Ошибка загрузки изображений: {str(e)}")
267
  return []
268
 
269
- def create_image_document(image_data):
270
- doc_id = image_data.get('document_id', 'unknown')
271
- image_number = image_data.get('image_number', 'unknown')
272
- image_title = image_data.get('image_title', '')
273
- image_description = image_data.get('image_description', '')
274
- section = image_data.get('section', '')
275
-
276
- text_parts = []
277
- text_parts.append(f"Рисунок {image_number}")
278
- if image_title:
279
- text_parts.append(f"Название: {image_title}")
280
- if section:
281
- text_parts.append(f"Раздел: {section}")
282
- if image_description:
283
- text_parts.append(f"Описание: {image_description}")
284
-
285
- text = "\n".join(text_parts)
286
-
287
- metadata = {
288
- 'document_id': doc_id,
289
- 'image_number': image_number,
290
- 'image_title': image_title,
291
- 'section': section,
292
- 'type': 'image'
293
- }
294
-
295
- return Document(text=text, metadata=metadata)
296
-
297
  def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
298
  log_message(f"Загрузка CSV чанков из {chunks_filename}")
299
 
 
 
 
300
  try:
301
  csv_path = hf_hub_download(
302
  repo_id=repo_id,
303
  filename=chunks_filename,
304
- token=hf_token,
305
  repo_type="dataset",
306
- local_dir=download_dir
307
  )
308
 
309
- df = pd.read_csv(csv_path)
310
- log_message(f"Загружено {len(df)} строк из CSV")
311
 
312
- documents = []
313
- for _, row in df.iterrows():
 
 
 
314
  metadata = {
315
  'document_id': row.get('document_id', 'unknown'),
316
- 'section_id': row.get('section_id', 'unknown'),
317
  'section_path': row.get('section_path', ''),
318
  'type': 'text'
319
  }
320
 
321
- text = row.get('chunk_text', '')
322
- if text:
323
- doc = Document(text=text, metadata=metadata)
324
- documents.append(doc)
325
 
326
  log_message(f"Создано {len(documents)} документов из CSV")
327
- return documents, df
328
-
329
  except Exception as e:
330
  log_message(f"Ошибка загрузки CSV: {str(e)}")
331
- return [], None
332
-
333
- def process_documents_with_chunking(documents):
334
- log_message(f"Чанкинг {len(documents)} документов")
335
-
336
- text_splitter = SentenceSplitter(
337
- chunk_size=CHUNK_SIZE,
338
- chunk_overlap=CHUNK_OVERLAP,
339
- separator=" ",
340
- backup_separators=["\n", ".", "!", "?"]
341
- )
342
-
343
- chunked_documents = []
344
- chunk_info = []
345
-
346
- for doc in documents:
347
- doc_type = doc.metadata.get('type', 'text')
348
-
349
- if doc_type == 'table':
350
- if isinstance(doc, list):
351
- chunked_documents.extend(doc)
352
- for d in doc:
353
- chunk_info.append(create_chunk_info(d))
354
- else:
355
- chunked_documents.append(doc)
356
- chunk_info.append(create_chunk_info(doc))
357
-
358
- elif doc_type == 'image':
359
- chunked_documents.append(doc)
360
- chunk_info.append(create_chunk_info(doc))
361
-
362
- else:
363
- token_count = estimate_tokens(doc.text)
364
-
365
- if token_count <= CHUNK_SIZE:
366
- chunked_documents.append(doc)
367
- chunk_info.append(create_chunk_info(doc))
368
- else:
369
- nodes = text_splitter.get_nodes_from_documents([doc])
370
-
371
- for node in nodes:
372
- new_doc = Document(
373
- text=node.text,
374
- metadata=doc.metadata
375
- )
376
- chunked_documents.append(new_doc)
377
- chunk_info.append(create_chunk_info(new_doc))
378
-
379
- log_message(f"Получено {len(chunked_documents)} чанков после обработки")
380
- return chunked_documents, chunk_info
381
-
382
- def create_chunk_info(doc):
383
- metadata = doc.metadata
384
-
385
- info = {
386
- 'document_id': metadata.get('document_id', 'unknown'),
387
- 'type': metadata.get('type', 'text'),
388
- 'chunk_text': doc.text[:500]
389
- }
390
-
391
- if metadata.get('type') == 'table':
392
- info['table_number'] = metadata.get('table_number', 'unknown')
393
- info['table_title'] = metadata.get('table_title', '')
394
- info['section'] = metadata.get('section', '')
395
-
396
- elif metadata.get('type') == 'image':
397
- info['image_number'] = metadata.get('image_number', 'unknown')
398
- info['image_title'] = metadata.get('image_title', '')
399
- info['section'] = metadata.get('section', '')
400
-
401
- else:
402
- info['section_id'] = metadata.get('section_id', 'unknown')
403
- info['section_path'] = metadata.get('section_path', '')
404
- info['section_text'] = metadata.get('section_text', '')
405
- info['parent_section'] = metadata.get('parent_section', '')
406
- info['parent_title'] = metadata.get('parent_title', '')
407
- info['level'] = metadata.get('level', 'section')
408
-
409
- return info
410
-
411
- def estimate_tokens(text):
412
- return len(text.split()) * 1.3
 
1
  import json
2
  import zipfile
3
+ import os
4
  import pandas as pd
5
  from huggingface_hub import hf_hub_download, list_repo_files
6
  from llama_index.core import Document
7
  from llama_index.core.text_splitter import SentenceSplitter
8
  from my_logging import log_message
9
  from config import CHUNK_SIZE, CHUNK_OVERLAP
10
+
11
def process_documents_with_chunking(documents):
    """Split every document into sentence-based chunks.

    Returns a pair ``(chunked_nodes, chunk_info)`` where *chunk_info* holds
    one metadata record per produced chunk.  Both are empty lists when no
    documents are supplied.
    """
    if not documents:
        return [], []

    log_message(f"Чанкинг {len(documents)} документов")

    splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

    chunked_docs = []
    chunk_info = []

    for source_doc in documents:
        # Metadata is per-document, so copy it once per document rather
        # than once per chunk (values below are only read).
        meta = source_doc.metadata.copy()

        for node in splitter.get_nodes_from_documents([source_doc]):
            chunked_docs.append(node)
            chunk_info.append({
                'document_id': meta.get('document_id', 'unknown'),
                'section_id': meta.get('section_id', 'unknown'),
                'section_path': meta.get('section_path', ''),
                'section_text': meta.get('section_text', ''),
                'parent_section': meta.get('parent_section', ''),
                'parent_title': meta.get('parent_title', ''),
                'level': meta.get('level', ''),
                'chunk_text': node.text,
                'type': meta.get('type', 'text'),
                'table_number': meta.get('table_number', ''),
                'image_number': meta.get('image_number', '')
            })

    log_message(f"Создано {len(chunked_docs)} чанков")
    return chunked_docs, chunk_info
 
49
def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
    """Download ZIP archives of section JSONs from the HF dataset repo,
    build one Document per non-empty section, and return the chunked
    documents together with their chunk-info records (both empty on error).
    """
    log_message(f"Загрузка JSON документов из {json_files_dir}")

    documents = []
    chunk_info = []

    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]

        log_message(f"Найдено {len(zip_files)} ZIP архивов")

        for zip_file in zip_files:
            log_message(f"Загружаю архив: {zip_file}")

            zip_path = hf_hub_download(
                repo_id=repo_id,
                filename=zip_file,
                local_dir=download_dir,
                repo_type="dataset",
                token=hf_token,
            )

            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                json_files = [name for name in zip_ref.namelist()
                              if name.endswith('.json') and not name.startswith('__MACOSX')]

                log_message(f"Найдено {len(json_files)} JSON файлов в {zip_file}")

                for json_file in json_files:
                    try:
                        # NOTE(review): the archive-member read below was
                        # collapsed context in the diff this block was
                        # reconstructed from — confirm against the full file.
                        with zip_ref.open(json_file) as fh:
                            json_data = json.load(fh)

                        doc_id = json_data.get('document_id', os.path.basename(json_file))
                        sections = json_data.get('sections', [])

                        for section in sections:
                            text = section.get('text', '').strip()
                            if not text:
                                # Skip sections with no usable body text.
                                continue

                            metadata = {
                                'document_id': doc_id,
                                'section_id': section.get('section_id', ''),
                                'section_path': section.get('section_path', ''),
                                'section_text': section.get('section_text', ''),
                                'parent_section': section.get('parent_section', ''),
                                'parent_title': section.get('parent_title', ''),
                                'level': section.get('level', ''),
                                'type': 'text'
                            }

                            documents.append(Document(text=text, metadata=metadata))

                    except Exception as e:
                        log_message(f"Ошибка обработки {json_file}: {str(e)}")

        log_message(f"Всего загружено {len(documents)} текстовых документов")

        if documents:
            return process_documents_with_chunking(documents)

        return [], []

    except Exception as e:
        log_message(f"Ошибка загрузки JSON: {str(e)}")
        return [], []
119
 
120
def chunk_large_table(table_text, table_number, table_title, doc_id, max_tokens=1500):
    """Split a rendered table into size-bounded text chunks.

    The first (up to) five non-blank lines are treated as the table header
    and repeated at the top of every chunk.  *max_tokens* is compared
    against ``len(table_text)`` — i.e. characters, not real tokens.

    Returns a list of chunk strings; a table shorter than *max_tokens* is
    returned whole as a single-element list.  *doc_id* is accepted for
    interface compatibility but not used here.
    """
    # Short tables need no splitting — check before doing any parsing.
    if len(table_text) < max_tokens:
        return [table_text]

    lines = table_text.split('\n')
    header_lines = [l for l in lines[:5] if l.strip()]
    # BUG FIX: the previous membership test (`l not in header_lines`) also
    # dropped any later data row whose text happened to equal a header
    # line (a table of repeating rows could produce zero chunks).  Split
    # positionally instead: everything after the first five lines is data.
    data_lines = [l for l in lines[5:] if l.strip()]

    # Aim for roughly max_tokens-sized chunks, but never fewer than 30
    # data rows per chunk.
    chunk_size = max(30, len(data_lines) // ((len(table_text) // max_tokens) + 1))

    chunks = []
    for start in range(0, len(data_lines), chunk_size):
        chunk_rows = data_lines[start:start + chunk_size]
        chunk_text = f"Таблица {table_number} - {table_title}\n"
        chunk_text += '\n'.join(header_lines) + '\n'
        chunk_text += '\n'.join(chunk_rows)
        chunks.append(chunk_text)

    return chunks
 
141
def load_table_data(repo_id, hf_token, table_data_dir):
    """Fetch table JSON files from the dataset repo and turn each table
    into one or more 'table'-typed Documents (large tables are split via
    chunk_large_table).  Returns an empty list on a top-level error.
    """
    log_message(f"Загрузка табличных данных из {table_data_dir}")

    documents = []

    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [name for name in repo_files
                       if name.startswith(table_data_dir) and name.endswith('.json')]

        log_message(f"Найдено {len(table_files)} файлов таблиц")

        for table_file in table_files:
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=table_file,
                    repo_type="dataset",
                    token=hf_token,
                )

                with open(local_path, 'r', encoding='utf-8') as fh:
                    table_data = json.load(fh)

                doc_id = table_data.get('document_id', '')
                table_number = table_data.get('table_number', '')
                table_title = table_data.get('table_title', '')

                # Render the table as plain text: caption, header row, data rows.
                parts = [f"Таблица {table_number} - {table_title}\n"]

                if 'headers' in table_data:
                    parts.append("Заголовки: " + " | ".join(table_data['headers']) + "\n")

                for row in table_data.get('data', []):
                    if isinstance(row, list):
                        parts.append(" | ".join(str(cell) for cell in row) + "\n")
                    elif isinstance(row, dict):
                        parts.append(" | ".join(f"{k}: {v}" for k, v in row.items()) + "\n")

                table_text = "".join(parts)

                for idx, chunk_text in enumerate(
                        chunk_large_table(table_text, table_number, table_title, doc_id)):
                    documents.append(Document(
                        text=chunk_text,
                        metadata={
                            'document_id': doc_id,
                            'table_number': table_number,
                            'table_title': table_title,
                            'type': 'table',
                            'chunk_index': idx,
                            'section_id': f"table_{table_number}",
                            'section_path': f"Таблица {table_number}",
                        },
                    ))

            except Exception as e:
                log_message(f"Ошибка обработки таблицы {table_file}: {str(e)}")

        log_message(f"Загружено {len(documents)} табличных документов")
        return documents

    except Exception as e:
        log_message(f"Ошибка загрузки таблиц: {str(e)}")
        return []
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
def load_image_data(repo_id, hf_token, image_data_dir):
    """Fetch image-description JSON files from the dataset repo and build
    one 'image'-typed Document per file.  Returns an empty list on a
    top-level error.
    """
    log_message(f"Загрузка данных изображений из {image_data_dir}")

    documents = []

    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        image_files = [name for name in repo_files
                       if name.startswith(image_data_dir) and name.endswith('.json')]

        log_message(f"Найдено {len(image_files)} файлов изображений")

        for image_file in image_files:
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=image_file,
                    repo_type="dataset",
                    token=hf_token,
                )

                with open(local_path, 'r', encoding='utf-8') as fh:
                    image_data = json.load(fh)

                doc_id = image_data.get('document_id', '')
                image_number = image_data.get('image_number', '')
                image_title = image_data.get('image_title', '')
                image_description = image_data.get('image_description', '')

                # Caption line plus an optional description paragraph.
                text = f"Рисунок {image_number} - {image_title}\n"
                if image_description:
                    text += f"Описание: {image_description}"

                documents.append(Document(
                    text=text,
                    metadata={
                        'document_id': doc_id,
                        'image_number': image_number,
                        'image_title': image_title,
                        'type': 'image',
                        'section_id': f"image_{image_number}",
                        'section_path': f"Рисунок {image_number}",
                    },
                ))

            except Exception as e:
                log_message(f"Ошибка обработки изображения {image_file}: {str(e)}")

        log_message(f"Загружено {len(documents)} документов изображений")
        return documents

    except Exception as e:
        log_message(f"Ошибка загрузки изображений: {str(e)}")
        return []
259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
    """Download a pre-chunked CSV from the dataset repo and build one
    'text'-typed Document per non-empty ``chunk_text`` row.

    Returns ``(documents, dataframe)``, or ``([], None)`` on error.
    """
    log_message(f"Загрузка CSV чанков из {chunks_filename}")

    documents = []
    chunks_df = None

    try:
        csv_path = hf_hub_download(
            repo_id=repo_id,
            filename=chunks_filename,
            local_dir=download_dir,
            repo_type="dataset",
            token=hf_token
        )

        chunks_df = pd.read_csv(csv_path)
        log_message(f"Загружено {len(chunks_df)} строк из CSV")

        for _, row in chunks_df.iterrows():
            text = row.get('chunk_text', '')
            # BUG FIX: pandas yields float NaN for empty CSV cells, and
            # `not nan` is False, so NaN rows previously produced
            # Document(text=nan).  Require a non-empty string.
            if not isinstance(text, str) or not text:
                continue

            metadata = {
                'document_id': row.get('document_id', 'unknown'),
                'section_id': row.get('section_id', ''),
                'section_path': row.get('section_path', ''),
                'type': 'text'
            }

            documents.append(Document(text=text, metadata=metadata))

        log_message(f"Создано {len(documents)} документов из CSV")
        return documents, chunks_df

    except Exception as e:
        log_message(f"Ошибка загрузки CSV: {str(e)}")
        return [], None