MrSimple07 committed on
Commit
a33029f
·
1 Parent(s): 5fc122f

new documents_prep

Browse files
Files changed (1) hide show
  1. documents_prep.py +442 -247
documents_prep.py CHANGED
@@ -1,298 +1,493 @@
1
  import json
2
  import zipfile
3
- import os
4
  import pandas as pd
 
5
  from huggingface_hub import hf_hub_download, list_repo_files
6
  from llama_index.core import Document
7
  from llama_index.core.text_splitter import SentenceSplitter
8
  from my_logging import log_message
9
  from config import CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
- def process_documents_with_chunking(documents):
12
- if not documents:
13
- return [], []
14
-
15
- log_message(f"Чанкинг {len(documents)} документов")
16
-
17
- text_splitter = SentenceSplitter(
 
18
  chunk_size=CHUNK_SIZE,
19
- chunk_overlap=CHUNK_OVERLAP
 
20
  )
21
 
22
- chunked_docs = []
23
- chunk_info = []
24
 
25
- for doc in documents:
26
- chunks = text_splitter.get_nodes_from_documents([doc])
 
 
 
 
 
 
27
 
28
- for chunk in chunks:
29
- chunked_docs.append(chunk)
30
-
31
- metadata = doc.metadata.copy()
32
- chunk_info.append({
33
- 'document_id': metadata.get('document_id', 'unknown'),
34
- 'section_id': metadata.get('section_id', 'unknown'),
35
- 'section_path': metadata.get('section_path', ''),
36
- 'section_text': metadata.get('section_text', ''),
37
- 'parent_section': metadata.get('parent_section', ''),
38
- 'parent_title': metadata.get('parent_title', ''),
39
- 'level': metadata.get('level', ''),
40
- 'chunk_text': chunk.text,
41
- 'type': metadata.get('type', 'text'),
42
- 'table_number': metadata.get('table_number', ''),
43
- 'image_number': metadata.get('image_number', '')
44
- })
45
-
46
- log_message(f"Создано {len(chunked_docs)} чанков")
47
- return chunked_docs, chunk_info
48
 
49
- def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
50
- log_message(f"Загрузка JSON документов из {json_files_dir}")
 
 
 
 
 
 
 
51
 
52
- documents = []
53
- chunk_info = []
54
 
55
- try:
56
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
57
- zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
58
-
59
- log_message(f"Найдено {len(zip_files)} ZIP архивов")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- for zip_file in zip_files:
62
- log_message(f"Загружаю архив: {zip_file}")
 
 
63
 
64
- zip_path = hf_hub_download(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  repo_id=repo_id,
66
- filename=zip_file,
67
- local_dir=download_dir,
68
  repo_type="dataset",
69
  token=hf_token
70
  )
71
 
72
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
73
- json_files = [f for f in zip_ref.namelist()
74
- if f.endswith('.json') and not f.startswith('__MACOSX')]
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- log_message(f"Найдено {len(json_files)} JSON файлов в {zip_file}")
 
 
 
77
 
78
- for json_file in json_files:
79
- try:
80
- with zip_ref.open(json_file) as f:
81
- json_data = json.load(f)
82
-
83
- doc_id = json_data.get('document_id', os.path.basename(json_file))
84
- sections = json_data.get('sections', [])
85
-
86
- for section in sections:
87
- text = section.get('text', '').strip()
88
- if not text:
89
- continue
90
-
91
- metadata = {
92
- 'document_id': doc_id,
93
- 'section_id': section.get('section_id', ''),
94
- 'section_path': section.get('section_path', ''),
95
- 'section_text': section.get('section_text', ''),
96
- 'parent_section': section.get('parent_section', ''),
97
- 'parent_title': section.get('parent_title', ''),
98
- 'level': section.get('level', ''),
99
- 'type': 'text'
100
- }
101
-
102
- doc = Document(text=text, metadata=metadata)
103
- documents.append(doc)
104
-
105
- except Exception as e:
106
- log_message(f"Ошибка обработки {json_file}: {str(e)}")
107
-
108
- log_message(f"Всего загружено {len(documents)} текстовых документов")
109
-
110
- if documents:
111
- chunked_docs, chunk_info = process_documents_with_chunking(documents)
112
- return chunked_docs, chunk_info
113
-
114
- return [], []
115
 
116
- except Exception as e:
117
- log_message(f"Ошибка загрузки JSON: {str(e)}")
118
- return [], []
119
-
120
- def chunk_large_table(table_text, table_number, table_title, doc_id, max_tokens=1500):
121
- chunks = []
122
 
123
- lines = table_text.split('\n')
124
- header_lines = [l for l in lines[:5] if l.strip()]
125
- data_lines = [l for l in lines if l.strip() and l not in header_lines]
 
 
 
 
 
126
 
127
- if len(table_text) < max_tokens:
128
- return [table_text]
 
 
129
 
130
- chunk_size = max(30, len(data_lines) // ((len(table_text) // max_tokens) + 1))
 
 
 
 
 
 
 
 
 
 
131
 
132
- for i in range(0, len(data_lines), chunk_size):
133
- chunk_data = data_lines[i:i+chunk_size]
134
- chunk_text = f"Таблица {table_number} - {table_title}\n"
135
- chunk_text += '\n'.join(header_lines) + '\n'
136
- chunk_text += '\n'.join(chunk_data)
137
- chunks.append(chunk_text)
138
 
139
- return chunks
140
-
141
- def load_table_data(repo_id, hf_token, table_data_dir):
142
- log_message(f"Загрузка табличных данных из {table_data_dir}")
143
 
 
 
 
 
 
 
 
 
 
 
144
  documents = []
145
 
146
- try:
147
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
148
- table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
 
 
 
149
 
150
- log_message(f"Найдено {len(table_files)} файлов таблиц")
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- for table_file in table_files:
153
- try:
154
- file_path = hf_hub_download(
155
- repo_id=repo_id,
156
- filename=table_file,
157
- repo_type="dataset",
158
- token=hf_token
 
 
 
 
 
 
 
 
 
 
159
  )
 
 
 
 
 
 
160
 
161
- with open(file_path, 'r', encoding='utf-8') as f:
162
- table_data = json.load(f)
163
-
164
- doc_id = table_data.get('document_id', '')
165
- table_number = table_data.get('table_number', '')
166
- table_title = table_data.get('table_title', '')
167
-
168
- table_text = f"Таблица {table_number} - {table_title}\n"
169
-
170
- if 'headers' in table_data:
171
- table_text += "Заголовки: " + " | ".join(table_data['headers']) + "\n"
172
-
173
- if 'data' in table_data:
174
- for row in table_data['data']:
175
- if isinstance(row, list):
176
- table_text += " | ".join(str(cell) for cell in row) + "\n"
177
- elif isinstance(row, dict):
178
- table_text += " | ".join(f"{k}: {v}" for k, v in row.items()) + "\n"
179
-
180
- chunks = chunk_large_table(table_text, table_number, table_title, doc_id)
181
-
182
- for idx, chunk_text in enumerate(chunks):
183
- metadata = {
184
- 'document_id': doc_id,
185
- 'table_number': table_number,
186
- 'table_title': table_title,
187
- 'type': 'table',
188
- 'chunk_index': idx,
189
- 'section_id': f"table_{table_number}",
190
- 'section_path': f"Таблица {table_number}"
191
- }
192
-
193
- doc = Document(text=chunk_text, metadata=metadata)
194
  documents.append(doc)
195
-
196
- except Exception as e:
197
- log_message(f"Ошибка обработки таблицы {table_file}: {str(e)}")
198
-
199
- log_message(f"Загружено {len(documents)} табличных документов")
200
- return documents
201
-
202
- except Exception as e:
203
- log_message(f"Ошибка загрузки таблиц: {str(e)}")
204
- return []
205
 
206
- def load_image_data(repo_id, hf_token, image_data_dir):
207
- log_message(f"Загрузка данных изображений из {image_data_dir}")
 
 
 
 
208
 
209
- documents = []
 
 
210
 
211
- try:
212
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
213
- image_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.json')]
214
-
215
- log_message(f"Найдено {len(image_files)} файлов изображений")
216
-
217
- for image_file in image_files:
218
- try:
219
- file_path = hf_hub_download(
220
- repo_id=repo_id,
221
- filename=image_file,
222
- repo_type="dataset",
223
- token=hf_token
224
- )
225
-
226
- with open(file_path, 'r', encoding='utf-8') as f:
227
- image_data = json.load(f)
228
-
229
- doc_id = image_data.get('document_id', '')
230
- image_number = image_data.get('image_number', '')
231
- image_title = image_data.get('image_title', '')
232
- image_description = image_data.get('image_description', '')
233
-
234
- text = f"Рисунок {image_number} - {image_title}\n"
235
- if image_description:
236
- text += f"Описание: {image_description}"
237
-
238
- metadata = {
239
- 'document_id': doc_id,
240
- 'image_number': image_number,
241
- 'image_title': image_title,
242
- 'type': 'image',
243
- 'section_id': f"image_{image_number}",
244
- 'section_path': f"Рисунок {image_number}"
245
- }
246
-
247
- doc = Document(text=text, metadata=metadata)
248
- documents.append(doc)
249
 
250
- except Exception as e:
251
- log_message(f"Ошибка обработки изображения {image_file}: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
- log_message(f"Загружено {len(documents)} документов изображений")
254
- return documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
- except Exception as e:
257
- log_message(f"Ошибка загрузки изображений: {str(e)}")
258
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
- def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
261
- log_message(f"Загрузка CSV чанков из {chunks_filename}")
 
 
 
 
 
 
 
262
 
263
- documents = []
264
- chunks_df = None
265
-
266
- try:
267
- csv_path = hf_hub_download(
268
- repo_id=repo_id,
269
- filename=chunks_filename,
270
- local_dir=download_dir,
271
- repo_type="dataset",
272
- token=hf_token
273
- )
274
-
275
- chunks_df = pd.read_csv(csv_path)
276
- log_message(f"Загружено {len(chunks_df)} строк из CSV")
277
-
278
- for _, row in chunks_df.iterrows():
279
- text = row.get('chunk_text', '')
280
- if not text:
281
- continue
282
 
283
- metadata = {
284
- 'document_id': row.get('document_id', 'unknown'),
285
- 'section_id': row.get('section_id', ''),
286
- 'section_path': row.get('section_path', ''),
287
- 'type': 'text'
288
- }
289
 
290
- doc = Document(text=text, metadata=metadata)
291
- documents.append(doc)
292
-
293
- log_message(f"Создано {len(documents)} документов из CSV")
294
- return documents, chunks_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
- except Exception as e:
297
- log_message(f"Ошибка загрузки CSV: {str(e)}")
298
- return [], None
 
 
 
 
 
 
1
  import json
2
  import zipfile
 
3
  import pandas as pd
4
+ from collections import Counter, defaultdict
5
  from huggingface_hub import hf_hub_download, list_repo_files
6
  from llama_index.core import Document
7
  from llama_index.core.text_splitter import SentenceSplitter
8
  from my_logging import log_message
9
  from config import CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
+
12
# ============================================================================
# TEXT CHUNKING - For regular text sections
# ============================================================================

def chunk_text_document(doc):
    """Split a text document into overlapping sentence-based chunks.

    Every chunk inherits a copy of the source document's metadata, extended
    with its position (``chunk_id``), the total chunk count, and its size
    in characters.
    """
    sentence_splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separator=" "
    )

    pieces = sentence_splitter.split_text(doc.text)
    log_message(f" ✂️ Text split into {len(pieces)} chunks")

    chunk_documents = []
    for position, piece in enumerate(pieces):
        piece_metadata = doc.metadata.copy()
        piece_metadata.update({
            "chunk_id": position,
            "total_chunks": len(pieces),
            "chunk_size": len(piece)
        })
        chunk_documents.append(Document(text=piece, metadata=piece_metadata))

    return chunk_documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+
41
# ============================================================================
# TABLE CHUNKING - Row-based splitting with headers preserved
# ============================================================================

def chunk_table_document(doc):
    """Split large tables by rows while keeping headers in each chunk.

    The text produced by ``create_table_text`` is parsed back into a header
    block and ``Строка N: ...`` data rows; rows are then grouped so that
    header + rows stay within CHUNK_SIZE, with a 2-row overlap between
    consecutive chunks for retrieval continuity.

    Returns a list of Document objects carrying chunk-position metadata.
    """
    table_num = doc.metadata.get('table_number', 'unknown')

    lines = doc.text.strip().split('\n')

    # Separate header info from data rows: everything before the
    # 'Данные таблицы:' marker (plus the marker line itself) is header.
    header_lines = []
    data_rows = []
    found_data = False

    for line in lines:
        if 'Данные таблицы:' in line:
            found_data = True
            header_lines.append(line)
        elif found_data and line.startswith('Строка'):
            data_rows.append(line)
        elif not found_data:
            header_lines.append(line)

    table_header = '\n'.join(header_lines) + '\n'

    if not data_rows:
        log_message(f" ⚠️ Table {table_num}: no data rows found, using standard split")
        return chunk_text_document(doc)

    log_message(f" 📊 Table {table_num}: found {len(data_rows)} data rows")

    # Calculate space available for rows. Clamp to at least 1: if the header
    # alone exceeds CHUNK_SIZE the budget would go non-positive and every
    # iteration would flush a chunk, duplicating the overlap rows heavily.
    header_size = len(table_header)
    available_size = max(CHUNK_SIZE - header_size - 100, 1)  # Reserve 100 chars

    # Split rows into chunks
    chunks = []
    current_rows = []
    current_size = 0

    for row in data_rows:
        row_size = len(row) + 1  # +1 for newline

        if current_size + row_size > available_size and current_rows:
            # Save current chunk
            chunks.append(table_header + '\n'.join(current_rows))

            # Keep last 2 rows for overlap with the next chunk
            overlap_rows = min(2, len(current_rows))
            current_rows = current_rows[-overlap_rows:]
            current_size = sum(len(r) + 1 for r in current_rows)

        current_rows.append(row)
        current_size += row_size

    # Add final chunk
    if current_rows:
        chunks.append(table_header + '\n'.join(current_rows))

    log_message(f" ✂️ Table split into {len(chunks)} chunks")

    # Create documents with metadata
    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(chunks),
            "chunk_size": len(chunk_text),
            "is_chunked": True
        })
        chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))

    return chunked_docs
120
+
121
+
122
# ============================================================================
# TABLE DATA LOADING
# ============================================================================

def create_table_text(table_data):
    """Format table data as readable text.

    Emits a header block (table number, title, document, section, and
    column headers when present) followed by one ``Строка N: ...`` line per
    data row. Rows may be dicts (``key: value`` pairs, falsy values
    skipped) or lists (cells joined with `` | ``); other row types are
    ignored. Previously list-shaped rows were silently dropped.
    """
    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    text = f"Таблица: {table_num}\n"
    text += f"Название: {table_title}\n"
    text += f"Документ: {doc_id}\n"
    text += f"Раздел: {section}\n"

    headers = table_data.get('headers', [])
    if headers:
        text += f"\nЗаголовки: {' | '.join(headers)}\n"

    if 'data' in table_data and table_data['data']:
        text += "\nДанные таблицы:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):
            if isinstance(row, dict):
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
            elif isinstance(row, list):
                # Legacy list-shaped rows: join raw cell values.
                row_text = " | ".join(str(cell) for cell in row)
            else:
                continue
            text += f"Строка {row_idx}: {row_text}\n"

    return text
150
+
151
+
152
def load_tables_from_json(repo_id, hf_token, table_data_dir):
    """Load and process all tables from JSON files.

    Downloads every ``.json`` under *table_data_dir* in the dataset repo,
    renders each table (or workbook sheet) as text via ``create_table_text``
    and chunks oversized tables with ``chunk_table_document``.

    Returns a list of Document objects (whole tables and table chunks).
    """
    log_message("=" * 60)
    log_message("LOADING TABLE DATA")
    log_message("=" * 60)

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]

    log_message(f"Found {len(table_files)} JSON table files")

    table_documents = []
    stats = defaultdict(lambda: {'count': 0, 'total_size': 0, 'chunked': 0})

    for file_path in table_files:
        try:
            # NOTE(review): local_dir='' downloads relative to the current
            # working directory — confirm this is intended vs. the default
            # HF cache (local_dir=None).
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                local_dir='',
                repo_type="dataset",
                token=hf_token
            )

            log_message(f"\n📄 Processing: {file_path}")

            with open(local_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            document_id = data.get('document', 'unknown')

            # A workbook payload carries a 'sheets' list; otherwise the
            # payload itself is a single table.
            sheets = data['sheets'] if 'sheets' in data else [data]

            # Cast the sort key to str: table numbers may mix ints and
            # strings across files, which would raise TypeError in sorted().
            for sheet in sorted(sheets, key=lambda x: str(x.get('table_number', ''))):
                # Skip empty tables
                if not sheet.get('data'):
                    log_message(f" ⚠️ Skipping empty table {sheet.get('table_number')}")
                    continue

                # Create table text
                table_text = create_table_text(sheet)
                table_size = len(table_text)
                table_num = sheet.get('table_number', 'unknown')

                # Create base document
                doc = Document(
                    text=table_text,
                    metadata={
                        "type": "table",
                        "table_number": table_num,
                        "table_title": sheet.get('table_title', 'unknown'),
                        "document_id": document_id,
                        "section": sheet.get('section', 'unknown'),
                        "section_id": sheet.get('section', 'unknown'),
                        "total_rows": len(sheet.get('data', [])),
                        "content_size": table_size
                    }
                )

                # Chunk if necessary
                if table_size > CHUNK_SIZE:
                    log_message(f" 📊 Table {table_num}: {table_size} chars > {CHUNK_SIZE}, chunking...")
                    docs = chunk_table_document(doc)
                    stats[document_id]['chunked'] += 1
                else:
                    log_message(f" ✓ Table {table_num}: {table_size} chars, keeping whole")
                    docs = [doc]

                table_documents.extend(docs)
                stats[document_id]['count'] += len(docs)
                stats[document_id]['total_size'] += table_size

        except Exception as e:
            log_message(f" ERROR: {str(e)}")
            continue

    # Summary
    log_message("\n" + "=" * 60)
    log_message("TABLE STATISTICS")
    log_message("=" * 60)
    total_tables = sum(s['count'] for s in stats.values())
    total_chunked = sum(s['chunked'] for s in stats.values())
    log_message(f"Total table chunks: {total_tables}")
    log_message(f"Large tables chunked: {total_chunked}")

    for doc_id, doc_stats in sorted(stats.items()):
        log_message(f" • {doc_id}: {doc_stats['count']} chunks, "
                    f"{doc_stats['chunked']} tables split")
    log_message("=" * 60)

    return table_documents
244
+
245
+
246
# ============================================================================
# TEXT SECTIONS LOADING
# ============================================================================

def extract_section_title(text):
    """Derive a short title from the first line (or first sentence) of *text*."""
    stripped = text.strip()
    if not stripped:
        return ""

    head = stripped.split('\n')[0].strip()

    # A short line without a trailing period already reads like a heading.
    if len(head) < 200 and not head.endswith('.'):
        return head

    # Otherwise fall back to the first sentence of that line.
    sentence_parts = head.split('.')
    if len(sentence_parts) > 1:
        return sentence_parts[0].strip()

    # No sentence boundary at all: truncate an over-long line.
    return head if len(head) <= 100 else head[:100] + "..."
267
+
268
+
269
def _make_section_document(text, document_id, document_name, section_id, level,
                           parent_section=None):
    """Build one text Document with the standard section metadata layout."""
    metadata = {
        "type": "text",
        "document_id": document_id,
        "document_name": document_name,
        "section_id": section_id,
        "section_title": extract_section_title(text)[:200],
        "level": level
    }
    # Top-level sections have no parent; keep the key absent in that case
    # to match the previous metadata shape.
    if parent_section is not None:
        metadata["parent_section"] = parent_section
    return Document(text=text, metadata=metadata)


def extract_sections_from_json(data, document_id, document_name):
    """Recursively extract all sections from JSON structure.

    Walks the fixed three-level hierarchy (sections -> subsections ->
    sub_subsections) and returns one Document per non-empty text body.
    The duplicated Document construction of the original is factored into
    ``_make_section_document``.
    """
    documents = []

    if 'sections' not in data:
        return documents

    for section in data['sections']:
        section_id = section.get('section_id', 'Unknown')
        section_text = section.get('section_text', '')

        if section_text.strip():
            documents.append(_make_section_document(
                section_text, document_id, document_name,
                section_id, "section"))

        # Process subsections recursively
        for subsection in section.get('subsections', []):
            subsection_id = subsection.get('subsection_id', 'Unknown')
            subsection_text = subsection.get('subsection_text', '')

            if subsection_text.strip():
                documents.append(_make_section_document(
                    subsection_text, document_id, document_name,
                    subsection_id, "subsection", parent_section=section_id))

            # Process sub-subsections
            for sub_subsection in subsection.get('sub_subsections', []):
                sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
                sub_subsection_text = sub_subsection.get('sub_subsection_text', '')

                if sub_subsection_text.strip():
                    documents.append(_make_section_document(
                        sub_subsection_text, document_id, document_name,
                        sub_subsection_id, "sub_subsection",
                        parent_section=subsection_id))

    return documents
 
 
 
 
 
 
 
 
335
 
336
+
337
def _sections_from_payload(data):
    """Extract text-section Documents from one parsed document JSON payload."""
    metadata = data.get('document_metadata', {})
    doc_id = metadata.get('document_id', 'unknown')
    doc_name = metadata.get('document_name', 'unknown')
    return extract_sections_from_json(data, doc_id, doc_name)


def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
    """Load text sections from JSON files and ZIP archives.

    Downloads every ``.zip`` and ``.json`` under *json_files_dir*, extracts
    section Documents from each JSON payload (the parsing logic shared by
    both branches lives in ``_sections_from_payload``) and chunks any
    section longer than CHUNK_SIZE.

    Returns the flat list of chunked Document objects.
    """
    log_message("=" * 60)
    log_message("LOADING TEXT DOCUMENTS")
    log_message("=" * 60)

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
    json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]

    log_message(f"Found {len(zip_files)} ZIP files and {len(json_files)} JSON files")

    all_documents = []

    # Process ZIP files
    for zip_path in zip_files:
        try:
            log_message(f"\n📦 Processing ZIP: {zip_path}")
            local_zip = hf_hub_download(
                repo_id=repo_id,
                filename=zip_path,
                local_dir=download_dir,
                repo_type="dataset",
                token=hf_token
            )

            with zipfile.ZipFile(local_zip, 'r') as zip_ref:
                # Skip macOS resource-fork entries bundled into the archive.
                json_in_zip = [f for f in zip_ref.namelist()
                               if f.endswith('.json') and not f.startswith('__MACOSX')]

                for json_file in json_in_zip:
                    with zip_ref.open(json_file) as f:
                        data = json.load(f)

                    docs = _sections_from_payload(data)
                    all_documents.extend(docs)
                    log_message(f" ✓ {json_file}: {len(docs)} sections")

        except Exception as e:
            log_message(f" ❌ ERROR: {str(e)}")
            continue

    # Process direct JSON files
    for json_path in json_files:
        try:
            log_message(f"\n📄 Processing JSON: {json_path}")
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=json_path,
                local_dir=download_dir,
                repo_type="dataset",
                token=hf_token
            )

            with open(local_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            docs = _sections_from_payload(data)
            all_documents.extend(docs)
            log_message(f" ✓ Extracted {len(docs)} sections")

        except Exception as e:
            log_message(f" ERROR: {str(e)}")
            continue

    log_message(f"\n✓ Total text sections: {len(all_documents)}")

    # Apply chunking to over-long sections only.
    chunked_docs = []
    chunked_count = 0

    for doc in all_documents:
        if len(doc.text) > CHUNK_SIZE:
            log_message(f" ✂️ Chunking section '{doc.metadata.get('section_id')}' "
                        f"({len(doc.text)} chars)")
            chunks = chunk_text_document(doc)
            chunked_docs.extend(chunks)
            chunked_count += 1
        else:
            chunked_docs.append(doc)

    log_message(f"\n✓ After chunking: {len(chunked_docs)} total chunks")
    log_message(f"✓ Sections chunked: {chunked_count}")
    log_message("=" * 60)

    return chunked_docs
431
+
432
 
433
# ============================================================================
# IMAGE DATA LOADING
# ============================================================================

def load_image_documents(repo_id, hf_token, image_data_dir):
    """Load image metadata from CSV files.

    Downloads every ``.csv`` under *image_data_dir* in the dataset repo and
    turns each row into one Document describing the image (number, title,
    description, source document, section, file path).
    """
    log_message("=" * 60)
    log_message("LOADING IMAGE METADATA")
    log_message("=" * 60)

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    csv_files = [name for name in repo_files
                 if name.startswith(image_data_dir) and name.endswith('.csv')]

    log_message(f"Found {len(csv_files)} CSV image files")

    image_docs = []

    for remote_csv in csv_files:
        try:
            log_message(f"\n📷 Processing: {remote_csv}")
            downloaded = hf_hub_download(
                repo_id=repo_id,
                filename=remote_csv,
                local_dir='',
                repo_type="dataset",
                token=hf_token
            )

            frame = pd.read_csv(downloaded)
            log_message(f" ✓ Loaded {len(frame)} image records")

            for _, record in frame.iterrows():
                # Assemble the searchable text block for one image record.
                body_lines = [
                    f"Изображение: {record.get('№ Изображения', 'Неизвестно')}",
                    f"Название: {record.get('Название изображения', 'Неизвестно')}",
                    f"Описание: {record.get('Описание изображение', 'Неизвестно')}",
                    f"Документ: {record.get('Обозначение документа', 'Неизвестно')}",
                    f"Раздел: {record.get('Раздел документа', 'Неизвестно')}",
                    f"Файл: {record.get('Файл изображения', 'Неизвестно')}",
                ]

                image_docs.append(Document(
                    text='\n'.join(body_lines) + '\n',
                    metadata={
                        "type": "image",
                        "image_number": str(record.get('№ Изображения', 'unknown')),
                        "image_title": str(record.get('Название изображения', 'unknown')),
                        "image_description": str(record.get('Описание изображение', 'unknown')),
                        "document_id": str(record.get('Обозначение документа', 'unknown')),
                        "file_path": str(record.get('Файл изображения', 'unknown')),
                        "section": str(record.get('Раздел документа', 'Неизвестно'))
                    }
                ))

        except Exception as e:
            log_message(f" ERROR: {str(e)}")
            continue

    log_message(f"\n✓ Total image documents: {len(image_docs)}")
    log_message("=" * 60)

    return image_docs