MrSimple01 committed on
Commit
abb0a7b
·
verified ·
1 Parent(s): 73cef4b

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +73 -27
documents_prep.py CHANGED
@@ -26,19 +26,21 @@ def normalize_steel_designations(text):
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
  Only applies to specific patterns to avoid changing legitimate Latin text.
 
29
  """
30
  if not text:
31
- return text
32
 
33
  import re
34
 
35
- # Pattern 1: Steel grades like 08X18H10T, 12X18H10T, etc.
36
- # Format: digits + Latin letters (no spaces typically)
37
- # Common steel designation pattern: [\d]+[XHTKBMCAP]+[\d]*[XHTKBMCAP]*
38
 
39
  def replace_in_steel_grade(match):
40
  """Replace Latin with Cyrillic only in steel grade context"""
 
41
  grade = match.group(0)
 
 
42
  # Mapping of Latin to Cyrillic for steel designations
43
  replacements = {
44
  'X': 'Х', # Latin X -> Cyrillic Х (Kha)
@@ -53,6 +55,10 @@ def normalize_steel_designations(text):
53
  }
54
  for latin, cyrillic in replacements.items():
55
  grade = grade.replace(latin, cyrillic)
 
 
 
 
56
  return grade
57
 
58
  # Pattern for steel grades: digits followed by letters and more digits/letters
@@ -69,7 +75,7 @@ def normalize_steel_designations(text):
69
  text = re.sub(r'\b[C]-\d{1,2}\b',
70
  lambda m: m.group(0).replace('C', 'С'), text)
71
 
72
- return text
73
 
74
 
75
 
@@ -79,12 +85,23 @@ def chunk_text_documents(documents):
79
  chunk_overlap=CHUNK_OVERLAP
80
  )
81
 
 
 
 
 
 
 
82
  chunked = []
83
  for doc in documents:
84
  chunks = text_splitter.get_nodes_from_documents([doc])
85
  for i, chunk in enumerate(chunks):
86
  # Normalize steel designations in the chunk text
87
- chunk.text = normalize_steel_designations(chunk.text)
 
 
 
 
 
88
 
89
  chunk.metadata.update({
90
  'chunk_id': i,
@@ -100,6 +117,12 @@ def chunk_text_documents(documents):
100
  max_size = max(len(c.text) for c in chunked)
101
  log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
102
  log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
 
 
 
 
 
 
103
 
104
  return chunked
105
 
@@ -113,13 +136,10 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
113
  sheet_name = table_data.get('sheet_name', '')
114
 
115
  # Apply steel designation normalization to title and section
116
- table_title = normalize_steel_designations(str(table_title))
117
- section = normalize_steel_designations(section)
118
 
119
  table_num_clean = str(table_num).strip()
120
- table_title_normalized = normalize_text(str(table_title))
121
-
122
- import re
123
 
124
  import re
125
 
@@ -156,17 +176,35 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
156
 
157
  # Normalize all row content (including steel designations)
158
  normalized_rows = []
 
 
 
159
  for row in rows:
160
  if isinstance(row, dict):
161
- normalized_row = {k: normalize_steel_designations(str(v)) for k, v in row.items()}
 
 
 
 
 
 
 
 
 
162
  normalized_rows.append(normalized_row)
163
  else:
164
  normalized_rows.append(row)
165
 
166
- # Calculate base metadata size with NORMALIZED title
 
 
 
 
 
 
167
  base_content = format_table_header(doc_id, table_identifier, table_num,
168
- table_title_normalized, section, headers,
169
- sheet_name) # Pass sheet_name
170
  base_size = len(base_content)
171
  available_space = max_chars - base_size - 200
172
 
@@ -181,21 +219,20 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
181
  'type': 'table',
182
  'document_id': doc_id,
183
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
184
- 'table_identifier': normalize_text(table_identifier),
185
- 'table_title': table_title_normalized,
186
  'section': section,
187
- 'sheet_name': sheet_name, # ADD THIS
188
  'total_rows': len(normalized_rows),
189
  'chunk_size': len(content),
190
  'is_complete_table': True,
191
- # ADD SEARCHABLE KEYWORDS
192
  'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
193
  }
194
 
195
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
196
  return [Document(text=content, metadata=metadata)]
197
 
198
- # Chunking logic continues with normalized_rows instead of rows...
199
  chunks = []
200
  current_rows = []
201
  current_size = 0
@@ -217,8 +254,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
217
  'type': 'table',
218
  'document_id': doc_id,
219
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
220
- 'table_identifier': normalize_text(table_identifier),
221
- 'table_title': table_title_normalized,
222
  'section': section,
223
  'sheet_name': sheet_name,
224
  'chunk_id': chunk_num,
@@ -252,8 +289,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
252
  'type': 'table',
253
  'document_id': doc_id,
254
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
255
- 'table_identifier': normalize_text(table_identifier),
256
- 'table_title': table_title_normalized,
257
  'section': section,
258
  'sheet_name': sheet_name,
259
  'chunk_id': chunk_num,
@@ -271,6 +308,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
271
  return chunks
272
 
273
 
 
274
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
275
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
276
 
@@ -510,11 +548,15 @@ def extract_sections_from_json(json_path):
510
 
511
  def load_table_documents(repo_id, hf_token, table_dir):
512
  log_message("Loading tables...")
 
 
513
 
514
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
515
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
516
 
517
  all_chunks = []
 
 
518
  for file_path in table_files:
519
  try:
520
  local_path = hf_hub_download(
@@ -531,15 +573,19 @@ def load_table_documents(repo_id, hf_token, table_dir):
531
 
532
  for sheet in data.get('sheets', []):
533
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
534
 
535
- # Use the consistent MAX_CHARS_TABLE from config
536
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
 
537
  all_chunks.extend(chunks)
538
 
539
  except Exception as e:
540
  log_message(f"Error loading {file_path}: {e}")
541
 
542
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
 
 
543
  return all_chunks
544
 
545
 
 
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
  Only applies to specific patterns to avoid changing legitimate Latin text.
29
+ Returns: (normalized_text, changes_count)
30
  """
31
  if not text:
32
+ return text, 0
33
 
34
  import re
35
 
36
+ changes_count = 0
 
 
37
 
38
  def replace_in_steel_grade(match):
39
  """Replace Latin with Cyrillic only in steel grade context"""
40
+ nonlocal changes_count
41
  grade = match.group(0)
42
+ original_grade = grade
43
+
44
  # Mapping of Latin to Cyrillic for steel designations
45
  replacements = {
46
  'X': 'Х', # Latin X -> Cyrillic Х (Kha)
 
55
  }
56
  for latin, cyrillic in replacements.items():
57
  grade = grade.replace(latin, cyrillic)
58
+
59
+ if grade != original_grade:
60
+ changes_count += 1
61
+
62
  return grade
63
 
64
  # Pattern for steel grades: digits followed by letters and more digits/letters
 
75
  text = re.sub(r'\b[C]-\d{1,2}\b',
76
  lambda m: m.group(0).replace('C', 'С'), text)
77
 
78
+ return text, changes_count
79
 
80
 
81
 
 
85
  chunk_overlap=CHUNK_OVERLAP
86
  )
87
 
88
+ log_message("="*60)
89
+ log_message("NORMALIZING STEEL DESIGNATIONS IN TEXT CHUNKS")
90
+
91
+ total_normalizations = 0
92
+ chunks_with_changes = 0
93
+
94
  chunked = []
95
  for doc in documents:
96
  chunks = text_splitter.get_nodes_from_documents([doc])
97
  for i, chunk in enumerate(chunks):
98
  # Normalize steel designations in the chunk text
99
+ original_text = chunk.text
100
+ chunk.text, changes = normalize_steel_designations(chunk.text)
101
+
102
+ if changes > 0:
103
+ chunks_with_changes += 1
104
+ total_normalizations += changes
105
 
106
  chunk.metadata.update({
107
  'chunk_id': i,
 
117
  max_size = max(len(c.text) for c in chunked)
118
  log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
119
  log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
120
+ log_message(f" Steel designation normalization:")
121
+ log_message(f" - Chunks with changes: {chunks_with_changes}/{len(chunked)}")
122
+ log_message(f" - Total steel grades normalized: {total_normalizations}")
123
+ log_message(f" - Avg per affected chunk: {total_normalizations/chunks_with_changes:.1f}" if chunks_with_changes > 0 else " - No normalizations needed")
124
+
125
+ log_message("="*60)
126
 
127
  return chunked
128
 
 
136
  sheet_name = table_data.get('sheet_name', '')
137
 
138
  # Apply steel designation normalization to title and section
139
+ table_title, title_changes = normalize_steel_designations(str(table_title))
140
+ section, section_changes = normalize_steel_designations(section)
141
 
142
  table_num_clean = str(table_num).strip()
 
 
 
143
 
144
  import re
145
 
 
176
 
177
  # Normalize all row content (including steel designations)
178
  normalized_rows = []
179
+ total_row_changes = 0
180
+ rows_with_changes = 0
181
+
182
  for row in rows:
183
  if isinstance(row, dict):
184
+ normalized_row = {}
185
+ row_had_changes = False
186
+ for k, v in row.items():
187
+ normalized_val, changes = normalize_steel_designations(str(v))
188
+ normalized_row[k] = normalized_val
189
+ if changes > 0:
190
+ total_row_changes += changes
191
+ row_had_changes = True
192
+ if row_had_changes:
193
+ rows_with_changes += 1
194
  normalized_rows.append(normalized_row)
195
  else:
196
  normalized_rows.append(row)
197
 
198
+ # Log normalization stats for this table
199
+ if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
200
+ log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
201
+ f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
202
+
203
+ # Continue with rest of existing logic using normalized_rows...
204
+ # Calculate base metadata size
205
  base_content = format_table_header(doc_id, table_identifier, table_num,
206
+ table_title, section, headers,
207
+ sheet_name)
208
  base_size = len(base_content)
209
  available_space = max_chars - base_size - 200
210
 
 
219
  'type': 'table',
220
  'document_id': doc_id,
221
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
222
+ 'table_identifier': table_identifier,
223
+ 'table_title': table_title,
224
  'section': section,
225
+ 'sheet_name': sheet_name,
226
  'total_rows': len(normalized_rows),
227
  'chunk_size': len(content),
228
  'is_complete_table': True,
 
229
  'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
230
  }
231
 
232
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
233
  return [Document(text=content, metadata=metadata)]
234
 
235
+ # Chunking logic continues...
236
  chunks = []
237
  current_rows = []
238
  current_size = 0
 
254
  'type': 'table',
255
  'document_id': doc_id,
256
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
257
+ 'table_identifier': table_identifier,
258
+ 'table_title': table_title,
259
  'section': section,
260
  'sheet_name': sheet_name,
261
  'chunk_id': chunk_num,
 
289
  'type': 'table',
290
  'document_id': doc_id,
291
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
292
+ 'table_identifier': table_identifier,
293
+ 'table_title': table_title,
294
  'section': section,
295
  'sheet_name': sheet_name,
296
  'chunk_id': chunk_num,
 
308
  return chunks
309
 
310
 
311
+
312
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
313
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
314
 
 
548
 
549
  def load_table_documents(repo_id, hf_token, table_dir):
550
  log_message("Loading tables...")
551
+ log_message("="*60)
552
+ log_message("NORMALIZING STEEL DESIGNATIONS IN TABLES")
553
 
554
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
555
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
556
 
557
  all_chunks = []
558
+ tables_processed = 0
559
+
560
  for file_path in table_files:
561
  try:
562
  local_path = hf_hub_download(
 
573
 
574
  for sheet in data.get('sheets', []):
575
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
576
+ tables_processed += 1
577
 
578
+ chunks = chunk_table_by_content(sheet, sheet_doc_id,
579
+ max_chars=MAX_CHARS_TABLE,
580
+ max_rows=MAX_ROWS_TABLE)
581
  all_chunks.extend(chunks)
582
 
583
  except Exception as e:
584
  log_message(f"Error loading {file_path}: {e}")
585
 
586
+ log_message(f"✓ Loaded {len(all_chunks)} table chunks from {tables_processed} tables")
587
+ log_message("="*60)
588
+
589
  return all_chunks
590
 
591