MrSimple07 committed on
Commit
0b34162
·
1 Parent(s): 6b0d121

simplest version

Browse files
Files changed (1) hide show
  1. documents_prep.py +56 -27
documents_prep.py CHANGED
@@ -23,54 +23,64 @@ def chunk_text_documents(documents):
23
  for i, chunk in enumerate(chunks):
24
  chunk.metadata.update({
25
  'chunk_id': i,
26
- 'total_chunks': len(chunks)
 
27
  })
28
  chunked.append(chunk)
29
 
30
- log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
 
 
 
 
 
 
 
31
  return chunked
32
 
33
 
34
- def chunk_table_by_rows(table_data, doc_id, max_rows=50):
35
  headers = table_data.get('headers', [])
36
  rows = table_data.get('data', [])
37
  table_num = table_data.get('table_number', 'unknown')
38
  table_title = table_data.get('table_title', '')
39
  section = table_data.get('section', '')
40
 
41
- table_num_clean = str(table_num).replace('№', '').strip()
 
42
 
43
  if not rows:
44
  return []
45
 
46
- if 'document_id' not in table_data:
47
- table_data['document_id'] = doc_id
48
-
49
  if len(rows) <= max_rows:
50
  content = format_table_content(table_data, headers, rows)
 
 
 
51
  return [Document(
52
  text=content,
53
  metadata={
54
  'type': 'table',
55
  'document_id': doc_id,
56
  'table_number': table_num_clean,
57
- 'table_number_original': table_num,
58
  'table_title': table_title,
59
  'section': section,
60
  'total_rows': len(rows),
 
61
  'is_complete_table': True
62
  }
63
  )]
64
 
 
65
  chunks = []
66
- overlap = 5
 
67
 
68
  for i in range(0, len(rows), max_rows - overlap):
69
  chunk_rows = rows[i:min(i+max_rows, len(rows))]
70
 
71
- chunk_info = f"Часть таблицы: строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
72
- if i > 0:
73
- chunk_info += " (с перекрытием для контекста)"
74
 
75
  content = format_table_content(
76
  table_data,
@@ -79,25 +89,31 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=50):
79
  chunk_info=chunk_info
80
  )
81
 
 
 
82
  chunks.append(Document(
83
  text=content,
84
  metadata={
85
  'type': 'table',
86
  'document_id': doc_id,
87
  'table_number': table_num_clean,
88
- 'table_number_original': table_num,
89
  'table_title': table_title,
90
  'section': section,
91
- 'chunk_id': i // (max_rows - overlap),
92
  'row_start': i,
93
  'row_end': i + len(chunk_rows),
94
  'total_rows': len(rows),
 
95
  'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
96
  'is_complete_table': False
97
  }
98
  ))
 
99
 
100
  log_message(f" 📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
 
 
 
101
  return chunks
102
 
103
 
@@ -220,16 +236,13 @@ def load_json_documents(repo_id, hf_token, json_dir):
220
  try:
221
  file_content = zf.read(json_file)
222
 
223
- if file_content.startswith(b'\xff\xfe') or file_content.startswith(b'\xfe\xff'):
224
- log_message(f" ✗ Skipping: {json_file} (appears to be UTF-16 encoded)")
225
- stats['failed'] += 1
226
- continue
227
-
228
- if not file_content.strip().startswith(b'{'):
229
- log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
230
  stats['failed'] += 1
231
  continue
232
 
 
233
  try:
234
  text_content = file_content.decode('utf-8')
235
  except UnicodeDecodeError:
@@ -237,11 +250,21 @@ def load_json_documents(repo_id, hf_token, json_dir):
237
  text_content = file_content.decode('utf-8-sig')
238
  except UnicodeDecodeError:
239
  try:
240
- text_content = file_content.decode('windows-1251')
 
241
  except UnicodeDecodeError:
242
- log_message(f" ✗ Skipping: {json_file} (encoding failed)")
243
- stats['failed'] += 1
244
- continue
 
 
 
 
 
 
 
 
 
245
 
246
  with tempfile.NamedTemporaryFile(mode='w', delete=False,
247
  suffix='.json', encoding='utf-8') as tmp:
@@ -395,19 +418,25 @@ def load_image_documents(repo_id, hf_token, image_dir):
395
  content += f"Описание: {row.get('Описание изображение', '')}\n"
396
  content += f"Раздел: {row.get('Раздел документа', '')}\n"
397
 
 
 
398
  documents.append(Document(
399
  text=content,
400
  metadata={
401
  'type': 'image',
402
  'document_id': str(row.get('Обозначение документа', 'unknown')),
403
  'image_number': str(row.get('№ Изображения', 'unknown')),
404
- 'section': str(row.get('Раздел документа', ''))
 
405
  }
406
  ))
407
  except Exception as e:
408
  log_message(f"Error loading {file_path}: {e}")
409
 
410
- log_message(f"✓ Loaded {len(documents)} images")
 
 
 
411
  return documents
412
 
413
 
 
23
  for i, chunk in enumerate(chunks):
24
  chunk.metadata.update({
25
  'chunk_id': i,
26
+ 'total_chunks': len(chunks),
27
+ 'chunk_size': len(chunk.text) # Add chunk size
28
  })
29
  chunked.append(chunk)
30
 
31
+ # Log statistics
32
+ if chunked:
33
+ avg_size = sum(len(c.text) for c in chunked) / len(chunked)
34
+ min_size = min(len(c.text) for c in chunked)
35
+ max_size = max(len(c.text) for c in chunked)
36
+ log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
37
+ log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
38
+
39
  return chunked
40
 
41
 
42
+ def chunk_table_by_rows(table_data, doc_id, max_rows=10): # Reduced from 30
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
45
  table_num = table_data.get('table_number', 'unknown')
46
  table_title = table_data.get('table_title', '')
47
  section = table_data.get('section', '')
48
 
49
+ # Keep original format
50
+ table_num_clean = str(table_num).strip()
51
 
52
  if not rows:
53
  return []
54
 
55
+ # For small tables, keep as single chunk
 
 
56
  if len(rows) <= max_rows:
57
  content = format_table_content(table_data, headers, rows)
58
+ chunk_size = len(content)
59
+ log_message(f" 📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → 1 chunk ({chunk_size} chars)")
60
+
61
  return [Document(
62
  text=content,
63
  metadata={
64
  'type': 'table',
65
  'document_id': doc_id,
66
  'table_number': table_num_clean,
 
67
  'table_title': table_title,
68
  'section': section,
69
  'total_rows': len(rows),
70
+ 'chunk_size': chunk_size,
71
  'is_complete_table': True
72
  }
73
  )]
74
 
75
+ # For large tables, chunk with overlap
76
  chunks = []
77
+ overlap = 3 # Reduced overlap
78
+ chunk_num = 0
79
 
80
  for i in range(0, len(rows), max_rows - overlap):
81
  chunk_rows = rows[i:min(i+max_rows, len(rows))]
82
 
83
+ chunk_info = f"Часть {chunk_num+1}: строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
 
 
84
 
85
  content = format_table_content(
86
  table_data,
 
89
  chunk_info=chunk_info
90
  )
91
 
92
+ chunk_size = len(content)
93
+
94
  chunks.append(Document(
95
  text=content,
96
  metadata={
97
  'type': 'table',
98
  'document_id': doc_id,
99
  'table_number': table_num_clean,
 
100
  'table_title': table_title,
101
  'section': section,
102
+ 'chunk_id': chunk_num,
103
  'row_start': i,
104
  'row_end': i + len(chunk_rows),
105
  'total_rows': len(rows),
106
+ 'chunk_size': chunk_size,
107
  'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
108
  'is_complete_table': False
109
  }
110
  ))
111
+ chunk_num += 1
112
 
113
  log_message(f" 📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
114
+ for idx, chunk in enumerate(chunks):
115
+ log_message(f" Chunk {idx+1}: rows {chunk.metadata['row_start']}-{chunk.metadata['row_end']} ({chunk.metadata['chunk_size']} chars)")
116
+
117
  return chunks
118
 
119
 
 
236
  try:
237
  file_content = zf.read(json_file)
238
 
239
+ # Skip if file is too small
240
+ if len(file_content) < 10:
241
+ log_message(f" ✗ Skipping: {json_file} (file too small)")
 
 
 
 
242
  stats['failed'] += 1
243
  continue
244
 
245
+ # Try UTF-8 first (most common)
246
  try:
247
  text_content = file_content.decode('utf-8')
248
  except UnicodeDecodeError:
 
250
  text_content = file_content.decode('utf-8-sig')
251
  except UnicodeDecodeError:
252
  try:
253
+ # Try UTF-16 (the issue you're seeing)
254
+ text_content = file_content.decode('utf-16')
255
  except UnicodeDecodeError:
256
+ try:
257
+ text_content = file_content.decode('windows-1251')
258
+ except UnicodeDecodeError:
259
+ log_message(f" ✗ Skipping: {json_file} (encoding failed)")
260
+ stats['failed'] += 1
261
+ continue
262
+
263
+ # Validate JSON structure
264
+ if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
265
+ log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
266
+ stats['failed'] += 1
267
+ continue
268
 
269
  with tempfile.NamedTemporaryFile(mode='w', delete=False,
270
  suffix='.json', encoding='utf-8') as tmp:
 
418
  content += f"Описание: {row.get('Описание изображение', '')}\n"
419
  content += f"Раздел: {row.get('Раздел документа', '')}\n"
420
 
421
+ chunk_size = len(content)
422
+
423
  documents.append(Document(
424
  text=content,
425
  metadata={
426
  'type': 'image',
427
  'document_id': str(row.get('Обозначение документа', 'unknown')),
428
  'image_number': str(row.get('№ Изображения', 'unknown')),
429
+ 'section': str(row.get('Раздел документа', '')),
430
+ 'chunk_size': chunk_size
431
  }
432
  ))
433
  except Exception as e:
434
  log_message(f"Error loading {file_path}: {e}")
435
 
436
+ if documents:
437
+ avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
438
+ log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
439
+
440
  return documents
441
 
442