MrSimple07 commited on
Commit
9160af0
·
1 Parent(s): 4775037

new documents_prep

Browse files
Files changed (2) hide show
  1. app.py +6 -1
  2. documents_prep.py +512 -332
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
- from documents_prep import load_json_documents, load_table_data, load_image_data
5
  from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
@@ -127,6 +127,11 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
127
  json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
128
  all_documents.extend(json_documents)
129
  chunk_info.extend(json_chunk_info)
 
 
 
 
 
130
 
131
  if table_data_dir:
132
  log_message("Добавляю табличные данные")
 
1
  import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
+ from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
5
  from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
 
127
  json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
128
  all_documents.extend(json_documents)
129
  chunk_info.extend(json_chunk_info)
130
+ else:
131
+ if chunks_filename:
132
+ log_message("Загружаем данные из CSV")
133
+ csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
134
+ all_documents.extend(csv_documents)
135
 
136
  if table_data_dir:
137
  log_message("Добавляю табличные данные")
documents_prep.py CHANGED
@@ -1,7 +1,7 @@
1
  import json
2
  import zipfile
3
  import pandas as pd
4
- from collections import Counter, defaultdict
5
  from huggingface_hub import hf_hub_download, list_repo_files
6
  from llama_index.core import Document
7
  from llama_index.core.text_splitter import SentenceSplitter
@@ -10,26 +10,25 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
 
12
  # ============================================================================
13
- # TEXT CHUNKING - For regular text sections
14
  # ============================================================================
15
 
16
  def chunk_text_document(doc):
17
- """Split text document into semantic chunks"""
18
- splitter = SentenceSplitter(
19
  chunk_size=CHUNK_SIZE,
20
  chunk_overlap=CHUNK_OVERLAP,
21
  separator=" "
22
  )
23
 
24
- chunks = splitter.split_text(doc.text)
25
- log_message(f" ✂️ Text split into {len(chunks)} chunks")
26
-
27
  chunked_docs = []
28
- for i, chunk_text in enumerate(chunks):
 
29
  chunk_metadata = doc.metadata.copy()
30
  chunk_metadata.update({
31
  "chunk_id": i,
32
- "total_chunks": len(chunks),
33
  "chunk_size": len(chunk_text)
34
  })
35
 
@@ -39,226 +38,265 @@ def chunk_text_document(doc):
39
 
40
 
41
  # ============================================================================
42
- # TABLE CHUNKING - Row-based splitting with headers preserved
43
  # ============================================================================
44
 
45
- def chunk_table_document(doc):
46
- """Split large tables by rows while keeping headers in each chunk"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  table_num = doc.metadata.get('table_number', 'unknown')
48
  table_title = doc.metadata.get('table_title', 'unknown')
49
 
 
50
  lines = doc.text.strip().split('\n')
51
 
52
- # Separate header info from data rows
53
- header_lines = []
54
  data_rows = []
55
- found_data = False
56
 
57
  for line in lines:
58
- if 'Данные таблицы:' in line:
59
- found_data = True
60
- header_lines.append(line)
61
- elif found_data and line.startswith('Строка'):
62
  data_rows.append(line)
63
- elif not found_data:
64
- header_lines.append(line)
65
 
66
- table_header = '\n'.join(header_lines) + '\n'
67
 
 
68
  if not data_rows:
69
- log_message(f" ⚠️ Table {table_num}: no data rows found, using standard split")
70
  return chunk_text_document(doc)
71
 
72
- log_message(f" 📊 Table {table_num}: found {len(data_rows)} data rows")
73
 
74
- # Calculate space available for rows
75
  header_size = len(table_header)
76
- available_size = CHUNK_SIZE - header_size - 100 # Reserve 100 chars
77
 
78
- # Split rows into chunks
79
- chunks = []
80
- current_rows = []
81
  current_size = 0
82
 
83
  for row in data_rows:
84
- row_size = len(row) + 1 # +1 for newline
85
 
86
- if current_size + row_size > available_size and current_rows:
87
- # Save current chunk
88
- chunk_text = table_header + '\n'.join(current_rows)
89
- chunks.append(chunk_text)
 
90
 
91
  # Keep last 2 rows for overlap
92
- overlap_rows = min(2, len(current_rows))
93
- current_rows = current_rows[-overlap_rows:]
94
- current_size = sum(len(r) + 1 for r in current_rows)
95
 
96
- current_rows.append(row)
97
  current_size += row_size
98
 
99
- # Add final chunk
100
- if current_rows:
101
- chunk_text = table_header + '\n'.join(current_rows)
102
- chunks.append(chunk_text)
 
103
 
104
- log_message(f" ✂️ Table split into {len(chunks)} chunks")
105
 
106
- # Create documents with metadata
107
  chunked_docs = []
108
- for i, chunk_text in enumerate(chunks):
 
 
109
  chunk_metadata = doc.metadata.copy()
110
  chunk_metadata.update({
111
  "chunk_id": i,
112
- "total_chunks": len(chunks),
113
  "chunk_size": len(chunk_text),
114
- "is_chunked": True
 
115
  })
116
 
117
- chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))
 
 
 
 
 
 
 
118
 
119
  return chunked_docs
120
 
121
 
122
- # ============================================================================
123
- # TABLE DATA LOADING
124
- # ============================================================================
125
-
126
- def load_table_data(table_data):
127
- """Format table data as readable text"""
128
- doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
129
  table_num = table_data.get('table_number', 'Неизвестно')
130
  table_title = table_data.get('table_title', 'Неизвестно')
131
  section = table_data.get('section', 'Неизвестно')
132
 
133
- text = f"Таблица: {table_num}\n"
134
- text += f"Название: {table_title}\n"
135
- text += f"Документ: {doc_id}\n"
136
- text += f"Раздел: {section}\n"
137
-
138
- headers = table_data.get('headers', [])
139
- if headers:
140
- text += f"\nЗаголовки: {' | '.join(headers)}\n"
141
-
142
- if 'data' in table_data and table_data['data']:
143
- text += "\nДанные таблицы:\n"
144
- for row_idx, row in enumerate(table_data['data'], start=1):
145
- if isinstance(row, dict):
146
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
147
- text += f"Строка {row_idx}: {row_text}\n"
 
 
 
 
 
 
148
 
149
- return text
 
 
 
 
 
 
150
 
151
 
152
- def load_tables_from_json(repo_id, hf_token, table_data_dir):
153
- """Load and process all tables from JSON files"""
154
  log_message("=" * 60)
155
- log_message("LOADING TABLE DATA")
156
  log_message("=" * 60)
157
 
158
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
159
- table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
160
-
161
- log_message(f"Found {len(table_files)} JSON table files")
162
-
163
- table_documents = []
164
- stats = defaultdict(lambda: {'count': 0, 'total_size': 0, 'chunked': 0})
165
-
166
- for file_path in table_files:
167
- try:
168
- local_path = hf_hub_download(
169
- repo_id=repo_id,
170
- filename=file_path,
171
- local_dir='',
172
- repo_type="dataset",
173
- token=hf_token
174
- )
175
-
176
- log_message(f"\n📄 Processing: {file_path}")
177
-
178
- with open(local_path, 'r', encoding='utf-8') as f:
179
- data = json.load(f)
180
-
181
- document_id = data.get('document', 'unknown')
182
-
183
- # Process each table/sheet
184
- sheets = data.get('sheets', [data]) if 'sheets' in data else [data]
185
-
186
- for sheet in sorted(sheets, key=lambda x: x.get('table_number', '')):
187
- # Skip empty tables
188
- if not sheet.get('data'):
189
- log_message(f" ⚠️ Skipping empty table {sheet.get('table_number')}")
190
- continue
191
-
192
- # Create table text
193
- table_text = load_table_data(sheet)
194
- table_size = len(table_text)
195
- table_num = sheet.get('table_number', 'unknown')
196
-
197
- # Create base document
198
- doc = Document(
199
- text=table_text,
200
- metadata={
201
- "type": "table",
202
- "table_number": table_num,
203
- "table_title": sheet.get('table_title', 'unknown'),
204
- "document_id": document_id,
205
- "section": sheet.get('section', 'unknown'),
206
- "section_id": sheet.get('section', 'unknown'),
207
- "total_rows": len(sheet.get('data', [])),
208
- "content_size": table_size
209
- }
210
  )
211
 
212
- # Chunk if necessary
213
- if table_size > CHUNK_SIZE:
214
- log_message(f" 📊 Table {table_num}: {table_size} chars > {CHUNK_SIZE}, chunking...")
215
- docs = chunk_table_document(doc)
216
- stats[document_id]['chunked'] += 1
217
- else:
218
- log_message(f" ✓ Table {table_num}: {table_size} chars, keeping whole")
219
- docs = [doc]
220
 
221
- table_documents.extend(docs)
222
- stats[document_id]['count'] += len(docs)
223
- stats[document_id]['total_size'] += table_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
- except Exception as e:
226
- log_message(f" ERROR: {str(e)}")
227
- continue
228
-
229
- # Summary
230
- log_message("\n" + "=" * 60)
231
- log_message("TABLE STATISTICS")
232
- log_message("=" * 60)
233
- total_tables = sum(s['count'] for s in stats.values())
234
- total_chunked = sum(s['chunked'] for s in stats.values())
235
- log_message(f"Total table chunks: {total_tables}")
236
- log_message(f"Large tables chunked: {total_chunked}")
237
-
238
- for doc_id, doc_stats in sorted(stats.items()):
239
- log_message(f" • {doc_id}: {doc_stats['count']} chunks, "
240
- f"{doc_stats['chunked']} tables split")
241
- log_message("=" * 60)
242
-
243
- return table_documents
244
 
245
 
246
  # ============================================================================
247
- # TEXT SECTIONS LOADING
248
  # ============================================================================
249
 
250
- def extract_section_title(text):
251
- """Extract first line or sentence as title"""
252
- if not text.strip():
253
  return ""
254
 
255
- first_line = text.strip().split('\n')[0].strip()
256
 
257
- # If short and doesn't end with period, use as-is
258
  if len(first_line) < 200 and not first_line.endswith('.'):
259
  return first_line
260
 
261
- # Otherwise extract first sentence
262
  sentences = first_line.split('.')
263
  if len(sentences) > 1:
264
  return sentences[0].strip()
@@ -266,8 +304,8 @@ def extract_section_title(text):
266
  return first_line[:100] + "..." if len(first_line) > 100 else first_line
267
 
268
 
269
- def extract_sections_from_json(data, document_id, document_name):
270
- """Recursively extract all sections from JSON structure"""
271
  documents = []
272
 
273
  if 'sections' not in data:
@@ -278,6 +316,7 @@ def extract_sections_from_json(data, document_id, document_name):
278
  section_text = section.get('section_text', '')
279
 
280
  if section_text.strip():
 
281
  doc = Document(
282
  text=section_text,
283
  metadata={
@@ -285,48 +324,32 @@ def extract_sections_from_json(data, document_id, document_name):
285
  "document_id": document_id,
286
  "document_name": document_name,
287
  "section_id": section_id,
288
- "section_title": extract_section_title(section_text)[:200],
 
289
  "level": "section"
290
  }
291
  )
292
  documents.append(doc)
293
 
294
  # Process subsections recursively
295
- for subsection in section.get('subsections', []):
296
- subsection_id = subsection.get('subsection_id', 'Unknown')
297
- subsection_text = subsection.get('subsection_text', '')
298
-
299
- if subsection_text.strip():
300
- doc = Document(
301
- text=subsection_text,
302
- metadata={
303
- "type": "text",
304
- "document_id": document_id,
305
- "document_name": document_name,
306
- "section_id": subsection_id,
307
- "section_title": extract_section_title(subsection_text)[:200],
308
- "level": "subsection",
309
- "parent_section": section_id
310
- }
311
- )
312
- documents.append(doc)
313
-
314
- # Process sub-subsections
315
- for sub_subsection in subsection.get('sub_subsections', []):
316
- sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
317
- sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
318
 
319
- if sub_subsection_text.strip():
 
320
  doc = Document(
321
- text=sub_subsection_text,
322
  metadata={
323
  "type": "text",
324
  "document_id": document_id,
325
  "document_name": document_name,
326
- "section_id": sub_subsection_id,
327
- "section_title": extract_section_title(sub_subsection_text)[:200],
328
- "level": "sub_subsection",
329
- "parent_section": subsection_id
 
330
  }
331
  )
332
  documents.append(doc)
@@ -335,159 +358,316 @@ def extract_sections_from_json(data, document_id, document_name):
335
 
336
 
337
  def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
338
- """Load text sections from JSON files and ZIP archives"""
339
  log_message("=" * 60)
340
- log_message("LOADING TEXT DOCUMENTS")
341
  log_message("=" * 60)
342
 
343
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
344
- zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
345
- json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
346
-
347
- log_message(f"Found {len(zip_files)} ZIP files and {len(json_files)} JSON files")
348
-
349
- all_documents = []
350
-
351
- # Process ZIP files
352
- for zip_path in zip_files:
353
- try:
354
- log_message(f"\n📦 Processing ZIP: {zip_path}")
355
- local_zip = hf_hub_download(
356
- repo_id=repo_id,
357
- filename=zip_path,
358
- local_dir=download_dir,
359
- repo_type="dataset",
360
- token=hf_token
361
- )
362
-
363
- with zipfile.ZipFile(local_zip, 'r') as zip_ref:
364
- json_in_zip = [f for f in zip_ref.namelist()
365
- if f.endswith('.json') and not f.startswith('__MACOSX')]
366
 
367
- for json_file in json_in_zip:
368
- with zip_ref.open(json_file) as f:
369
- data = json.load(f)
370
 
371
- metadata = data.get('document_metadata', {})
372
- doc_id = metadata.get('document_id', 'unknown')
373
- doc_name = metadata.get('document_name', 'unknown')
374
-
375
- docs = extract_sections_from_json(data, doc_id, doc_name)
376
- all_documents.extend(docs)
377
- log_message(f" ✓ {json_file}: {len(docs)} sections")
378
-
379
- except Exception as e:
380
- log_message(f" ❌ ERROR: {str(e)}")
381
- continue
382
-
383
- # Process direct JSON files
384
- for json_path in json_files:
385
- try:
386
- log_message(f"\n📄 Processing JSON: {json_path}")
387
- local_path = hf_hub_download(
388
- repo_id=repo_id,
389
- filename=json_path,
390
- local_dir=download_dir,
391
- repo_type="dataset",
392
- token=hf_token
393
- )
394
-
395
- with open(local_path, 'r', encoding='utf-8') as f:
396
- data = json.load(f)
397
-
398
- metadata = data.get('document_metadata', {})
399
- doc_id = metadata.get('document_id', 'unknown')
400
- doc_name = metadata.get('document_name', 'unknown')
401
-
402
- docs = extract_sections_from_json(data, doc_id, doc_name)
403
- all_documents.extend(docs)
404
- log_message(f" ✓ Extracted {len(docs)} sections")
405
 
406
- except Exception as e:
407
- log_message(f" ❌ ERROR: {str(e)}")
408
- continue
409
-
410
- log_message(f"\n✓ Total text sections: {len(all_documents)}")
411
-
412
- # Apply chunking
413
- chunked_docs = []
414
- chunked_count = 0
415
-
416
- for doc in all_documents:
417
- if len(doc.text) > CHUNK_SIZE:
418
- log_message(f" ✂️ Chunking section '{doc.metadata.get('section_id')}' "
419
- f"({len(doc.text)} chars)")
420
- chunks = chunk_text_document(doc)
421
- chunked_docs.extend(chunks)
422
- chunked_count += 1
423
- else:
424
- chunked_docs.append(doc)
425
-
426
- log_message(f"\n✓ After chunking: {len(chunked_docs)} total chunks")
427
- log_message(f"✓ Sections chunked: {chunked_count}")
428
- log_message("=" * 60)
429
-
430
- return chunked_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
431
 
432
 
433
  # ============================================================================
434
- # IMAGE DATA LOADING
435
  # ============================================================================
436
 
437
  def load_image_data(repo_id, hf_token, image_data_dir):
438
  """Load image metadata from CSV files"""
439
  log_message("=" * 60)
440
- log_message("LOADING IMAGE METADATA")
441
  log_message("=" * 60)
442
 
443
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
444
- image_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.csv')]
445
-
446
- log_message(f"Found {len(image_files)} CSV image files")
447
-
448
- image_docs = []
449
-
450
- for csv_path in image_files:
451
- try:
452
- log_message(f"\n📷 Processing: {csv_path}")
453
- local_path = hf_hub_download(
454
- repo_id=repo_id,
455
- filename=csv_path,
456
- local_dir='',
457
- repo_type="dataset",
458
- token=hf_token
459
- )
460
-
461
- df = pd.read_csv(local_path)
462
- log_message(f" Loaded {len(df)} image records")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
 
464
- for _, row in df.iterrows():
465
- text = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
466
- text += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
467
- text += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
468
- text += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
469
- text += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"
470
- text += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
 
 
 
 
 
 
 
 
 
 
 
 
471
 
472
- doc = Document(
473
- text=text,
474
- metadata={
475
- "type": "image",
476
- "image_number": str(row.get('№ Изображения', 'unknown')),
477
- "image_title": str(row.get('Название изображения', 'unknown')),
478
- "image_description": str(row.get('Описание изображение', 'unknown')),
479
- "document_id": str(row.get('Обозначение документа', 'unknown')),
480
- "file_path": str(row.get('Файл изображения', 'unknown')),
481
- "section": str(row.get('Раздел документа', 'Неизвестно'))
482
- }
483
- )
484
- image_docs.append(doc)
 
 
 
 
 
 
 
 
 
485
 
486
- except Exception as e:
487
- log_message(f" ❌ ERROR: {str(e)}")
488
- continue
489
-
490
- log_message(f"\n✓ Total image documents: {len(image_docs)}")
491
- log_message("=" * 60)
492
-
493
- return image_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import zipfile
3
  import pandas as pd
4
+ from collections import Counter
5
  from huggingface_hub import hf_hub_download, list_repo_files
6
  from llama_index.core import Document
7
  from llama_index.core.text_splitter import SentenceSplitter
 
10
 
11
 
12
  # ============================================================================
13
+ # TEXT CHUNKING
14
  # ============================================================================
15
 
16
def chunk_text_document(doc):
    """Split a text Document into overlapping sentence-based chunks.

    Each resulting Document copies the source metadata and adds its
    chunk index, the total chunk count, and its character length.
    """
    splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separator=" ",
    )
    pieces = splitter.split_text(doc.text)

    chunked_docs = []
    for idx, piece in enumerate(pieces):
        meta = doc.metadata.copy()
        meta.update({
            "chunk_id": idx,
            "total_chunks": len(pieces),
            "chunk_size": len(piece),
        })
        chunked_docs.append(Document(text=piece, metadata=meta))

    return chunked_docs
 
 
38
 
39
 
40
  # ============================================================================
41
+ # TABLE PROCESSING
42
  # ============================================================================
43
 
44
def extract_table_metadata(table_text):
    """Collect the most frequent content words of a table for enrichment.

    Returns a dict with a short Russian summary string and up to 15
    "key_terms": words longer than 3 characters, stopwords excluded,
    ordered by descending frequency.
    """
    # Short Russian function words that must not count as key terms.
    stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а",
                 "как", "или", "но", "к", "от"}

    tokens = table_text.split()
    candidates = (tok for tok in tokens
                  if len(tok) > 3 and tok.lower() not in stopwords)
    ranked = Counter(candidates).most_common(15)

    return {
        "summary": f"Таблица содержит {len(tokens)} слов",
        "key_terms": [term for term, _ in ranked],
    }
60
+
61
+
62
def create_table_content(table_data):
    """Render a table dict as human-readable Russian text.

    Emits a header block (number, title, document, section), then the
    column headers if present, then one "Строка N: ..." line per dict
    row; falsy cell values are omitted from a row.
    """
    parts = [
        f"Таблица: {table_data.get('table_number', 'Неизвестно')}\n",
        f"Название: {table_data.get('table_title', 'Неизвестно')}\n",
        f"Документ: {table_data.get('document_id', table_data.get('document', 'Неизвестно'))}\n",
        f"Раздел: {table_data.get('section', 'Неизвестно')}\n",
    ]

    headers = table_data.get('headers', [])
    if headers:
        parts.append(f"\nЗаголовки: {' | '.join(headers)}\n")

    rows = table_data.get('data')
    if isinstance(rows, list):
        parts.append("\nДанные таблицы:\n")
        for row_idx, row in enumerate(rows, start=1):
            if isinstance(row, dict):
                cells = " | ".join(f"{k}: {v}" for k, v in row.items() if v)
                parts.append(f"Строка {row_idx}: {cells}\n")

    return "".join(parts)
88
+
89
+
90
def chunk_table_by_rows(doc):
    """Split a large table Document into chunks by data rows.

    Every chunk repeats the table header block so each piece stays
    self-describing, keeps a 2-row overlap between neighbouring chunks,
    and is prefixed with the table title plus extracted key terms.
    Falls back to plain text chunking when no data rows are found.
    """
    # Key terms are computed once for the whole table and attached to
    # every chunk (both in metadata and in the enrichment prefix).
    table_metadata = extract_table_metadata(doc.text)
    table_num = doc.metadata.get('table_number', 'unknown')
    table_title = doc.metadata.get('table_title', 'unknown')

    # Parse the rendered table back into header lines and data rows.
    lines = doc.text.strip().split('\n')
    table_header_lines = []
    data_rows = []
    in_data = False

    for line in lines:
        if line.startswith('Данные таблицы:'):
            in_data = True
            table_header_lines.append(line)
        elif in_data and line.startswith('Строка'):
            data_rows.append(line)
        elif not in_data:
            table_header_lines.append(line)

    table_header = '\n'.join(table_header_lines) + '\n'

    # No recognizable rows -> fall back to the generic sentence splitter.
    if not data_rows:
        log_message(f"  ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
        return chunk_text_document(doc)

    log_message(f"  📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")

    # Row-based chunking: leave room for the repeated header and the
    # enrichment prefix added below.
    header_size = len(table_header)
    available_size = CHUNK_SIZE - header_size - 300  # Reserve space for enrichment
    # FIX: a very long header can drive available_size to zero or below,
    # which degrades the loop to near one-row chunks; keep a sane minimum
    # row budget even if the chunk then slightly exceeds CHUNK_SIZE.
    available_size = max(available_size, 200)

    text_chunks = []
    current_chunk_rows = []
    current_size = 0

    for row in data_rows:
        row_size = len(row) + 1  # +1 for the newline separator

        # If adding this row would exceed the limit, flush the current chunk.
        if current_size + row_size > available_size and current_chunk_rows:
            chunk_text = table_header + '\n'.join(current_chunk_rows)
            text_chunks.append(chunk_text)
            log_message(f"  ✂️ Создан чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")

            # Keep last 2 rows for overlap between neighbouring chunks.
            overlap_count = min(2, len(current_chunk_rows))
            current_chunk_rows = current_chunk_rows[-overlap_count:]
            current_size = sum(len(r) + 1 for r in current_chunk_rows)

        current_chunk_rows.append(row)
        current_size += row_size

    # Final chunk with whatever rows remain.
    if current_chunk_rows:
        chunk_text = table_header + '\n'.join(current_chunk_rows)
        text_chunks.append(chunk_text)
        log_message(f"  ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")

    log_message(f"  📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")

    # Wrap every text chunk into a Document with chunk bookkeeping metadata.
    chunked_docs = []
    key_terms = table_metadata.get("key_terms", [])

    for i, chunk_text in enumerate(text_chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(text_chunks),
            "chunk_size": len(chunk_text),
            "is_chunked": True,
            "key_terms": key_terms
        })

        # Prefix each chunk with the title and key terms to aid retrieval.
        terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
        enriched_text = f"""[Таблица {table_num}: {table_title}]
[Ключевые термины: {terms_str}]

{chunk_text}"""

        chunked_docs.append(Document(text=enriched_text, metadata=chunk_metadata))

    return chunked_docs
180
 
181
 
182
def table_to_document(table_data, document_id=None):
    """Turn one table dict into a list of Documents.

    Returns an empty list for malformed or empty tables; otherwise a
    single Document, or several row-based chunks when the rendered
    content exceeds CHUNK_SIZE.
    """
    if not isinstance(table_data, dict):
        log_message("⚠️ ПРОПУЩЕНА: table_data не является словарем")
        return []

    table_num = table_data.get('table_number', 'Неизвестно')
    rows = table_data.get('data', [])
    if not rows:
        log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} - нет данных")
        return []

    # Prefer the explicit document_id argument over values in the dict.
    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    content = create_table_content(table_data)
    content_size = len(content)

    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_title": table_data.get('table_title', 'Неизвестно'),
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "total_rows": len(rows),
            "content_size": content_size,
        },
    )

    # Oversized tables are split by rows; small ones stay whole.
    if content_size > CHUNK_SIZE:
        log_message(f"📊 CHUNKING: Таблица {table_num} | Размер: {content_size} > {CHUNK_SIZE}")
        return chunk_table_by_rows(base_doc)

    log_message(f"✓ Таблица {table_num} | Размер: {content_size} символов | Строк: {len(rows)}")
    return [base_doc]
222
 
223
 
224
def load_table_data(repo_id, hf_token, table_data_dir):
    """Download every table JSON under *table_data_dir* and build Documents.

    Multi-sheet files (a 'sheets' key) are processed sheet by sheet in
    table_number order; failures are logged per file and skipped.
    Returns a flat list of table Documents, or [] on a fatal error.
    """
    log_message("=" * 60)
    log_message("ЗАГРУЗКА ТАБЛИЧНЫХ ДАННЫХ")
    log_message("=" * 60)

    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [p for p in repo_files
                       if p.startswith(table_data_dir) and p.endswith('.json')]

        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")

        table_documents = []

        for file_path in table_files:
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token,
                )

                log_message(f"\nОбработка файла: {file_path}")

                with open(local_path, 'r', encoding='utf-8') as fh:
                    table_data = json.load(fh)

                # Non-dict payloads are silently skipped.
                if not isinstance(table_data, dict):
                    continue

                document_id = table_data.get('document', 'unknown')

                if 'sheets' in table_data:
                    # Multi-sheet workbook: handle each sheet separately,
                    # ordered by table number.
                    for sheet in sorted(table_data['sheets'],
                                        key=lambda s: s.get('table_number', '')):
                        sheet['document'] = document_id
                        table_documents.extend(table_to_document(sheet, document_id))
                else:
                    table_documents.extend(table_to_document(table_data, document_id))

            except Exception as e:
                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
                continue

        log_message(f"\n{'='*60}")
        log_message(f"Загружено {len(table_documents)} табличных документов")
        log_message("=" * 60)

        return table_documents

    except Exception as e:
        log_message(f" ОШИБКА загрузки таблиц: {str(e)}")
        return []
 
 
 
 
 
 
 
 
 
 
284
 
285
 
286
  # ============================================================================
287
+ # JSON TEXT DOCUMENTS
288
  # ============================================================================
289
 
290
def extract_section_title(section_text):
    """Derive a short title from the first line of a section's text.

    Returns "" for blank input; otherwise the first line as-is when it
    is short and not a full sentence, else its first sentence, else a
    100-character prefix with an ellipsis.
    """
    stripped = section_text.strip()
    if not stripped:
        return ""

    first_line = stripped.split('\n')[0].strip()

    # A short line that does not end in a period is already a good title.
    if len(first_line) < 200 and not first_line.endswith('.'):
        return first_line

    # Otherwise take the first sentence, if there is one.
    head, sep, _ = first_line.partition('.')
    if sep:
        return head.strip()

    return first_line[:100] + "..." if len(first_line) > 100 else first_line
305
 
306
 
307
+ def extract_text_from_json(data, document_id, document_name):
308
+ """Extract text documents from JSON structure"""
309
  documents = []
310
 
311
  if 'sections' not in data:
 
316
  section_text = section.get('section_text', '')
317
 
318
  if section_text.strip():
319
+ section_title = extract_section_title(section_text)
320
  doc = Document(
321
  text=section_text,
322
  metadata={
 
324
  "document_id": document_id,
325
  "document_name": document_name,
326
  "section_id": section_id,
327
+ "section_text": section_title[:200],
328
+ "section_path": section_id,
329
  "level": "section"
330
  }
331
  )
332
  documents.append(doc)
333
 
334
  # Process subsections recursively
335
+ if 'subsections' in section:
336
+ for subsection in section['subsections']:
337
+ subsection_id = subsection.get('subsection_id', 'Unknown')
338
+ subsection_text = subsection.get('subsection_text', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
+ if subsection_text.strip():
341
+ subsection_title = extract_section_title(subsection_text)
342
  doc = Document(
343
+ text=subsection_text,
344
  metadata={
345
  "type": "text",
346
  "document_id": document_id,
347
  "document_name": document_name,
348
+ "section_id": subsection_id,
349
+ "section_text": subsection_title[:200],
350
+ "section_path": f"{section_id}.{subsection_id}",
351
+ "level": "subsection",
352
+ "parent_section": section_id
353
  }
354
  )
355
  documents.append(doc)
 
358
 
359
 
360
def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
    """Download text JSONs (plain and zipped) and return chunked Documents.

    Returns a tuple (chunked_documents, chunk_info); both empty on a
    fatal error. Individual bad files or archives are logged and skipped.
    """
    log_message("=" * 60)
    log_message("ЗАГРУЗКА JSON ДОКУМЕНТОВ")
    log_message("=" * 60)

    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        zip_files = [p for p in repo_files
                     if p.startswith(json_files_dir) and p.endswith('.zip')]
        json_files = [p for p in repo_files
                      if p.startswith(json_files_dir) and p.endswith('.json')]

        log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} JSON файлов")

        all_documents = []

        # ZIP archives: read member JSONs straight from the archive.
        for zip_file_path in zip_files:
            try:
                log_message(f"Загружаю ZIP: {zip_file_path}")
                local_zip_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=zip_file_path,
                    local_dir=download_dir,
                    repo_type="dataset",
                    token=hf_token,
                )

                with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
                    members = [m for m in zip_ref.namelist()
                               if m.endswith('.json') and not m.startswith('__MACOSX')]

                    for member in members:
                        with zip_ref.open(member) as fh:
                            json_data = json.load(fh)

                        meta = json_data.get('document_metadata', {})
                        all_documents.extend(extract_text_from_json(
                            json_data,
                            meta.get('document_id', 'unknown'),
                            meta.get('document_name', 'unknown'),
                        ))

                log_message(f"Извлечено документов из ZIP: {len(all_documents)}")

            except Exception as e:
                log_message(f"❌ ОШИБКА ZIP {zip_file_path}: {str(e)}")
                continue

        # Stand-alone JSON files.
        for file_path in json_files:
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir=download_dir,
                    repo_type="dataset",
                    token=hf_token,
                )

                with open(local_path, 'r', encoding='utf-8') as fh:
                    json_data = json.load(fh)

                meta = json_data.get('document_metadata', {})
                all_documents.extend(extract_text_from_json(
                    json_data,
                    meta.get('document_id', 'unknown'),
                    meta.get('document_name', 'unknown'),
                ))

            except Exception as e:
                log_message(f" ОШИБКА JSON {file_path}: {str(e)}")
                continue

        log_message(f"Всего загружено {len(all_documents)} текстовых документов")

        # Apply size-based chunking to everything that was collected.
        chunked_documents, chunk_info = process_documents_with_chunking(all_documents)

        log_message(f"После chunking: {len(chunked_documents)} чанков")
        log_message("=" * 60)

        return chunked_documents, chunk_info

    except Exception as e:
        log_message(f"❌ ОШИБКА загрузки JSON: {str(e)}")
        return [], []
446
 
447
 
448
  # ============================================================================
449
+ # IMAGE DATA
450
  # ============================================================================
451
 
452
def load_image_data(repo_id, hf_token, image_data_dir, download_dir=''):
    """Load image metadata from CSV files in a HuggingFace dataset repo.

    Every row of each CSV under ``image_data_dir`` becomes one Document whose
    text is a human-readable summary of the image (number, title, description,
    source document, section) and whose metadata mirrors the key fields.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: HuggingFace access token.
        image_data_dir: Path prefix inside the repo to scan for CSV files.
        download_dir: Local directory for downloaded files. Defaults to ''
            (the previous hard-coded value) for backward compatibility;
            siblings like load_json_documents accept the same parameter.

    Returns:
        List of image Documents; ``[]`` if listing/downloading fails.
    """
    log_message("=" * 60)
    log_message("ЗАГРУЗКА ДАННЫХ ИЗОБРАЖЕНИЙ")
    log_message("=" * 60)

    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        image_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.csv')]

        log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")

        image_documents = []

        for file_path in image_files:
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir=download_dir,
                    repo_type="dataset",
                    token=hf_token
                )

                df = pd.read_csv(local_path)
                log_message(f"Загружено {len(df)} изображений из {file_path}")

                for _, row in df.iterrows():
                    # Russian column names below are the CSV schema — do not
                    # rename (including the 'Описание изображение' spelling,
                    # which matches the source files).
                    content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
                    content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
                    content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
                    content += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"

                    doc = Document(
                        text=content,
                        metadata={
                            "type": "image",
                            "image_number": str(row.get('№ Изображения', 'unknown')),
                            "image_title": str(row.get('Название изображения', 'unknown')),
                            "document_id": str(row.get('Обозначение документа', 'unknown')),
                            "section": str(row.get('Раздел документа', 'unknown'))
                        }
                    )
                    image_documents.append(doc)

            except Exception as e:
                # Best-effort: skip unreadable CSVs, keep loading the rest.
                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
                continue

        log_message(f"Загружено {len(image_documents)} документов изображений")
        log_message("=" * 60)

        return image_documents

    except Exception as e:
        log_message(f"❌ ОШИБКА загрузки изображений: {str(e)}")
        return []
510
+
511
+
512
+ # ============================================================================
513
+ # DOCUMENT PROCESSING WITH CHUNKING
514
+ # ============================================================================
515
+
516
def _chunk_preview(text, limit=200):
    """Return up to *limit* chars of *text*, appending '...' only when truncated."""
    return text[:limit] + "..." if len(text) > limit else text


def process_documents_with_chunking(documents):
    """Split oversized documents into chunks and build per-chunk metadata.

    Tables are passed through untouched (they arrive whole or pre-chunked
    upstream); image and text documents longer than CHUNK_SIZE characters are
    split via ``chunk_text_document()``.

    Args:
        documents: Documents whose metadata carries a 'type' key
            ('text', 'table' or 'image'; missing type is treated as 'text').

    Returns:
        Tuple ``(all_chunked_docs, chunk_info)`` where ``chunk_info`` is a
        list of dicts describing every resulting chunk.
    """
    all_chunked_docs = []
    chunk_info = []

    stats = {
        'text_chunks': 0,
        'table_whole': 0,
        'table_chunks': 0,
        'image_whole': 0,
        'image_chunks': 0
    }

    for doc in documents:
        doc_type = doc.metadata.get('type', 'text')
        is_already_chunked = doc.metadata.get('is_chunked', False)
        doc_size = len(doc.text)

        # Tables: never re-chunked here, only counted and passed through.
        if doc_type == 'table':
            if is_already_chunked:
                stats['table_chunks'] += 1
            else:
                stats['table_whole'] += 1

            all_chunked_docs.append(doc)
            chunk_info.append({
                'document_id': doc.metadata.get('document_id', 'unknown'),
                'section_id': doc.metadata.get('section_id', 'unknown'),
                'chunk_id': doc.metadata.get('chunk_id', 0),
                'total_chunks': doc.metadata.get('total_chunks', 1),
                'chunk_size': doc_size,
                # BUGFIX: truncation is conditional everywhere now — short
                # texts no longer get a spurious trailing "...".
                'chunk_preview': _chunk_preview(doc.text),
                'type': 'table',
                'table_number': doc.metadata.get('table_number', 'unknown')
            })

        # Images: chunk only when the description text exceeds CHUNK_SIZE.
        elif doc_type == 'image':
            if doc_size > CHUNK_SIZE:
                log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number')} | Размер: {doc_size}")
                chunked_docs = chunk_text_document(doc)
                stats['image_chunks'] += len(chunked_docs)
                all_chunked_docs.extend(chunked_docs)

                for i, chunk_doc in enumerate(chunked_docs):
                    chunk_info.append({
                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                        'chunk_id': i,
                        # Added for key-set consistency with table entries.
                        'total_chunks': len(chunked_docs),
                        'chunk_size': len(chunk_doc.text),
                        'chunk_preview': _chunk_preview(chunk_doc.text),
                        'type': 'image',
                        'image_number': chunk_doc.metadata.get('image_number', 'unknown')
                    })
            else:
                stats['image_whole'] += 1
                all_chunked_docs.append(doc)
                chunk_info.append({
                    'document_id': doc.metadata.get('document_id', 'unknown'),
                    'section_id': doc.metadata.get('section_id', 'unknown'),
                    'chunk_id': 0,
                    'total_chunks': 1,
                    'chunk_size': doc_size,
                    'chunk_preview': _chunk_preview(doc.text),
                    'type': 'image',
                    'image_number': doc.metadata.get('image_number', 'unknown')
                })

        # Text (and any unknown type): chunk only when too large.
        else:
            if doc_size > CHUNK_SIZE:
                log_message(f"📝 CHUNKING: Текст '{doc.metadata.get('document_id')}' | Размер: {doc_size}")
                chunked_docs = chunk_text_document(doc)
                stats['text_chunks'] += len(chunked_docs)
                all_chunked_docs.extend(chunked_docs)

                for i, chunk_doc in enumerate(chunked_docs):
                    chunk_info.append({
                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                        'chunk_id': i,
                        'total_chunks': len(chunked_docs),
                        'chunk_size': len(chunk_doc.text),
                        'chunk_preview': _chunk_preview(chunk_doc.text),
                        'type': 'text'
                    })
            else:
                all_chunked_docs.append(doc)
                chunk_info.append({
                    'document_id': doc.metadata.get('document_id', 'unknown'),
                    'section_id': doc.metadata.get('section_id', 'unknown'),
                    'chunk_id': 0,
                    'total_chunks': 1,
                    'chunk_size': doc_size,
                    'chunk_preview': _chunk_preview(doc.text),
                    'type': 'text'
                })

    # Summary for the operator log.
    log_message(f"\n{'='*60}")
    log_message("ИТОГОВАЯ СТАТИСТИКА:")
    log_message(f" • Текстовые чанки: {stats['text_chunks']}")
    log_message(f" • Таблицы (целые): {stats['table_whole']}")
    log_message(f" • Таблицы (чанки): {stats['table_chunks']}")
    log_message(f" • Изображения (целые): {stats['image_whole']}")
    log_message(f" • Изображения (чанки): {stats['image_chunks']}")
    log_message(f" • ВСЕГО ДОКУМЕНТОВ: {len(all_chunked_docs)}")
    log_message(f"{'='*60}\n")

    return all_chunked_docs, chunk_info
624
+
625
+
626
+ # ============================================================================
627
+ # CSV CHUNKS (Legacy support)
628
+ # ============================================================================
629
+
630
def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
    """Load pre-chunked text from a CSV file in the repo (legacy path).

    Downloads ``chunks_filename``, picks the column whose name mentions
    'text', 'content' or 'chunk' (first column as fallback) and wraps each
    row into a Document.

    Returns:
        Tuple ``(documents, chunks_df)``; ``([], None)`` on any failure.
    """
    log_message("Загрузка данных из CSV")

    try:
        chunks_csv_path = hf_hub_download(
            repo_id=repo_id,
            filename=chunks_filename,
            local_dir=download_dir,
            repo_type="dataset",
            token=hf_token,
        )

        chunks_df = pd.read_csv(chunks_csv_path)
        log_message(f"Загружено {len(chunks_df)} чанков из CSV")

        # Locate the column holding chunk text; fall back to the first column.
        keywords = ('text', 'content', 'chunk')
        text_column = next(
            (col for col in chunks_df.columns
             if any(kw in col.lower() for kw in keywords)),
            chunks_df.columns[0],
        )

        documents = []
        for i, (_, row) in enumerate(chunks_df.iterrows()):
            documents.append(
                Document(
                    text=str(row[text_column]),
                    metadata={
                        "chunk_id": row.get('chunk_id', i),
                        "document_id": row.get('document_id', 'unknown'),
                        "type": "text",
                    },
                )
            )

        log_message(f"Создано {len(documents)} документов из CSV")
        return documents, chunks_df

    except Exception as e:
        log_message(f"❌ ОШИБКА загрузки CSV: {str(e)}")
        return [], None