MrSimple01 commited on
Commit
450cf87
·
verified ·
1 Parent(s): 4849803

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +161 -55
documents_prep.py CHANGED
@@ -7,6 +7,72 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def chunk_text_documents(documents):
11
  text_splitter = SentenceSplitter(
12
  chunk_size=CHUNK_SIZE,
@@ -17,10 +83,13 @@ def chunk_text_documents(documents):
17
  for doc in documents:
18
  chunks = text_splitter.get_nodes_from_documents([doc])
19
  for i, chunk in enumerate(chunks):
 
 
 
20
  chunk.metadata.update({
21
  'chunk_id': i,
22
  'total_chunks': len(chunks),
23
- 'chunk_size': len(chunk.text) # Add chunk size
24
  })
25
  chunked.append(chunk)
26
 
@@ -34,20 +103,6 @@ def chunk_text_documents(documents):
34
 
35
  return chunked
36
 
37
- def normalize_text(text):
38
- if not text:
39
- return text
40
-
41
- # Replace Cyrillic 'C' with Latin 'С' (U+0421)
42
- # This is for welding types like C-25 -> С-25
43
- text = text.replace('С-', 'C')
44
-
45
- # Also handle cases like "Type C" or variations
46
- import re
47
- # Match "C" followed by digit or space in context of welding types
48
- text = re.sub(r'\bС(\d)', r'С\1', text)
49
-
50
- return text
51
 
52
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
53
  headers = table_data.get('headers', [])
@@ -55,80 +110,124 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
55
  table_num = table_data.get('table_number', 'unknown')
56
  table_title = table_data.get('table_title', '')
57
  section = table_data.get('section', '')
 
58
 
 
 
 
 
59
  table_num_clean = str(table_num).strip()
60
- table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
 
 
61
 
62
  import re
63
- if 'приложени' in section.lower():
64
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
65
- if appendix_match:
66
- appendix_num = appendix_match.group(1).upper()
67
- table_identifier = f"{table_num_clean} Приложение {appendix_num}"
 
 
 
 
 
68
  else:
69
- table_identifier = table_num_clean
 
 
 
 
70
  else:
71
- table_identifier = table_num_clean
 
 
 
 
 
 
 
 
72
 
73
  if not rows:
74
  return []
75
 
76
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
77
 
 
 
 
 
 
 
 
 
 
78
  # Calculate base metadata size with NORMALIZED title
79
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
 
 
80
  base_size = len(base_content)
81
  available_space = max_chars - base_size - 200
82
 
83
  # If entire table fits, return as one chunk
84
- full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
85
- if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
 
 
86
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
87
 
88
  metadata = {
89
  'type': 'table',
90
  'document_id': doc_id,
91
- 'table_number': table_num_clean,
92
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE identifier
93
- 'table_title': table_title_normalized, # NORMALIZED
94
  'section': section,
95
- 'total_rows': len(rows),
 
96
  'chunk_size': len(content),
97
- 'is_complete_table': True
 
 
98
  }
99
 
100
- log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
101
  return [Document(text=content, metadata=metadata)]
102
 
 
103
  chunks = []
104
  current_rows = []
105
  current_size = 0
106
  chunk_num = 0
107
 
108
- for i, row in enumerate(rows):
109
  row_text = format_single_row(row, i + 1)
110
  row_size = len(row_text)
111
 
112
- should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
 
113
 
114
  if should_split:
115
  content = base_content + format_table_rows(current_rows)
116
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
117
  content += format_table_footer(table_identifier, doc_id)
118
 
119
  metadata = {
120
  'type': 'table',
121
  'document_id': doc_id,
122
- 'table_number': table_num_clean,
123
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
124
- 'table_title': table_title_normalized, # NORMALIZED
125
  'section': section,
 
126
  'chunk_id': chunk_num,
127
  'row_start': current_rows[0]['_idx'] - 1,
128
  'row_end': current_rows[-1]['_idx'],
129
- 'total_rows': len(rows),
130
  'chunk_size': len(content),
131
- 'is_complete_table': False
 
132
  }
133
 
134
  chunks.append(Document(text=content, metadata=metadata))
@@ -138,31 +237,32 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
138
  current_rows = []
139
  current_size = 0
140
 
141
- # Add row with index
142
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
143
  row_copy['_idx'] = i + 1
144
  current_rows.append(row_copy)
145
  current_size += row_size
146
 
147
- # Add final chunk
148
  if current_rows:
149
  content = base_content + format_table_rows(current_rows)
150
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
151
  content += format_table_footer(table_identifier, doc_id)
152
 
153
  metadata = {
154
  'type': 'table',
155
  'document_id': doc_id,
156
- 'table_number': table_num_clean,
157
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
158
- 'table_title': table_title_normalized, # NORMALIZED
159
  'section': section,
 
160
  'chunk_id': chunk_num,
161
  'row_start': current_rows[0]['_idx'] - 1,
162
  'row_end': current_rows[-1]['_idx'],
163
- 'total_rows': len(rows),
164
  'chunk_size': len(content),
165
- 'is_complete_table': False
 
166
  }
167
 
168
  chunks.append(Document(text=content, metadata=metadata))
@@ -171,13 +271,15 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
171
  return chunks
172
 
173
 
174
- # MODIFIED: Update format_table_header function
175
- def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
176
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
177
 
178
- # Add table type/number prominently for matching
179
- if table_num:
180
- content += f"ТИП: {normalize_text(table_num)}\n"
 
 
 
181
 
182
  if table_title:
183
  content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
@@ -185,16 +287,20 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
185
  if section:
186
  content += f"РАЗДЕЛ: {section}\n"
187
 
 
 
 
188
  content += f"{'='*70}\n"
189
 
190
  if headers:
191
- header_str = ' | '.join(str(h) for h in headers)
 
 
192
  content += f"ЗАГОЛОВКИ: {header_str}\n\n"
193
 
194
  content += "ДАННЫЕ:\n"
195
  return content
196
 
197
-
198
  def format_single_row(row, idx):
199
  """Format a single row"""
200
  if isinstance(row, dict):
 
7
  from my_logging import log_message
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
 
10
def normalize_text(text):
    """Normalize Latin look-alike letters to Cyrillic in welding-type labels.

    Designations typed with a Latin 'C' (e.g. "C-25" or "C25") are converted
    to the Cyrillic 'С' (U+0421) form ("С-25", "С25") so matching against
    Russian documents is consistent.

    Args:
        text: Input string; falsy values (None, '') are returned unchanged.

    Returns:
        The normalized string, or the falsy input as-is.
    """
    if not text:
        return text

    # Latin 'C' followed by a dash -> Cyrillic 'С', KEEPING the dash.
    # (The previous code went the wrong direction — Cyrillic->Latin — and
    # dropped the dash entirely.)
    text = text.replace('C-', 'С-')

    import re
    # Latin 'C' directly followed by a digit (e.g. "C25") -> Cyrillic 'С'.
    # (The previous pattern used Cyrillic 'С' on both sides: a no-op.)
    text = re.sub(r'\bC(\d)', r'С\1', text)

    return text
24
+
25
def normalize_steel_designations(text):
    """
    Convert Latin look-alike letters to Cyrillic inside steel designations.
    Only applies to specific patterns to avoid changing legitimate Latin text.

    Examples: 08X18H10T -> 08Х18Н10Т, CB-08X19H10 -> СВ-08Х19Н10, C-25 -> С-25.

    Args:
        text: Input string; falsy values are returned unchanged.

    Returns:
        The string with designations transliterated to Cyrillic.
    """
    if not text:
        return text

    import re

    # Latin -> Cyrillic mapping, applied only inside matched designations.
    lat_to_cyr = str.maketrans({
        'X': 'Х',  # Latin X -> Cyrillic Kha
        'H': 'Н',  # Latin H -> Cyrillic En
        'T': 'Т',  # Latin T -> Cyrillic Te
        'C': 'С',  # Latin C -> Cyrillic Es
        'B': 'В',  # Latin B -> Cyrillic Ve
        'K': 'К',  # Latin K -> Cyrillic Ka
        'M': 'М',  # Latin M -> Cyrillic Em
        'A': 'А',  # Latin A -> Cyrillic A
        'P': 'Р',  # Latin P -> Cyrillic Er
    })

    def replace_in_steel_grade(match):
        """Transliterate Latin letters only within the matched designation."""
        return match.group(0).translate(lat_to_cyr)

    # Core of a grade: leading digits, then any number of letter/digit
    # alternations, e.g. 08X18H10T, 12X18H10T, 20X13. The repeated group is
    # required: a fixed two-group pattern cannot match 08X18H10T (three
    # digit/letter alternations) — the function's own documented example.
    grade_core = r'\d{1,3}[XHTCBKMAP]{1,4}(?:\d{1,2}[XHTCBKMAP]{0,4})*'

    # Welding wire designations like CB-08X19H10 must run BEFORE the bare
    # grade pattern: \b matches after '-', so the grade pattern would convert
    # the tail first and leave the 'CB-' prefix in Latin.
    text = re.sub(r'\b[CS]B-' + grade_core + r'\b', replace_in_steel_grade, text)

    # Bare steel grades: 08X18H10T, 12X18H10T, 20X13, etc.
    text = re.sub(r'\b' + grade_core + r'\b', replace_in_steel_grade, text)

    # Welding consumables like C-25, C-26 — only when followed by a dash and
    # digits, so section references are not touched.
    text = re.sub(r'\bC-\d{1,2}\b',
                  lambda m: m.group(0).replace('C', 'С'), text)

    return text
73
+
74
+
75
+
76
  def chunk_text_documents(documents):
77
  text_splitter = SentenceSplitter(
78
  chunk_size=CHUNK_SIZE,
 
83
  for doc in documents:
84
  chunks = text_splitter.get_nodes_from_documents([doc])
85
  for i, chunk in enumerate(chunks):
86
+ # Normalize steel designations in the chunk text
87
+ chunk.text = normalize_steel_designations(chunk.text)
88
+
89
  chunk.metadata.update({
90
  'chunk_id': i,
91
  'total_chunks': len(chunks),
92
+ 'chunk_size': len(chunk.text)
93
  })
94
  chunked.append(chunk)
95
 
 
103
 
104
  return chunked
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
108
  headers = table_data.get('headers', [])
 
110
  table_num = table_data.get('table_number', 'unknown')
111
  table_title = table_data.get('table_title', '')
112
  section = table_data.get('section', '')
113
+ sheet_name = table_data.get('sheet_name', '')
114
 
115
+ # Apply steel designation normalization to title and section
116
+ table_title = normalize_steel_designations(str(table_title))
117
+ section = normalize_steel_designations(section)
118
+
119
  table_num_clean = str(table_num).strip()
120
+ table_title_normalized = normalize_text(str(table_title))
121
+
122
+ import re
123
 
124
  import re
125
+
126
+ if table_num_clean in ['-', '', 'unknown', 'nan']:
127
+ if 'приложени' in sheet_name.lower() or 'приложени' in section.lower():
128
+ appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)',
129
+ (sheet_name + ' ' + section).lower())
130
+ if appendix_match:
131
+ appendix_num = appendix_match.group(1)
132
+ table_identifier = f"Приложение {appendix_num}"
133
+ else:
134
+ table_identifier = "Приложение"
135
  else:
136
+ if table_title:
137
+ first_words = ' '.join(table_title.split()[:5])
138
+ table_identifier = f"{first_words}"
139
+ else:
140
+ table_identifier = section.split(',')[0] if section else "БезНомера"
141
  else:
142
+ if 'приложени' in section.lower():
143
+ appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)', section.lower())
144
+ if appendix_match:
145
+ appendix_num = appendix_match.group(1)
146
+ table_identifier = f"{table_num_clean} Приложение {appendix_num}"
147
+ else:
148
+ table_identifier = table_num_clean
149
+ else:
150
+ table_identifier = table_num_clean
151
 
152
  if not rows:
153
  return []
154
 
155
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
156
 
157
+ # Normalize all row content (including steel designations)
158
+ normalized_rows = []
159
+ for row in rows:
160
+ if isinstance(row, dict):
161
+ normalized_row = {k: normalize_steel_designations(str(v)) for k, v in row.items()}
162
+ normalized_rows.append(normalized_row)
163
+ else:
164
+ normalized_rows.append(row)
165
+
166
  # Calculate base metadata size with NORMALIZED title
167
+ base_content = format_table_header(doc_id, table_identifier, table_num,
168
+ table_title_normalized, section, headers,
169
+ sheet_name) # Pass sheet_name
170
  base_size = len(base_content)
171
  available_space = max_chars - base_size - 200
172
 
173
  # If entire table fits, return as one chunk
174
+ full_rows_content = format_table_rows([{**row, '_idx': i+1}
175
+ for i, row in enumerate(normalized_rows)])
176
+
177
+ if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
178
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
179
 
180
  metadata = {
181
  'type': 'table',
182
  'document_id': doc_id,
183
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
184
+ 'table_identifier': normalize_text(table_identifier),
185
+ 'table_title': table_title_normalized,
186
  'section': section,
187
+ 'sheet_name': sheet_name, # ADD THIS
188
+ 'total_rows': len(normalized_rows),
189
  'chunk_size': len(content),
190
+ 'is_complete_table': True,
191
+ # ADD SEARCHABLE KEYWORDS
192
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
193
  }
194
 
195
+ log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
196
  return [Document(text=content, metadata=metadata)]
197
 
198
+ # Chunking logic continues with normalized_rows instead of rows...
199
  chunks = []
200
  current_rows = []
201
  current_size = 0
202
  chunk_num = 0
203
 
204
+ for i, row in enumerate(normalized_rows):
205
  row_text = format_single_row(row, i + 1)
206
  row_size = len(row_text)
207
 
208
+ should_split = (current_size + row_size > available_space or
209
+ len(current_rows) >= max_rows) and current_rows
210
 
211
  if should_split:
212
  content = base_content + format_table_rows(current_rows)
213
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
214
  content += format_table_footer(table_identifier, doc_id)
215
 
216
  metadata = {
217
  'type': 'table',
218
  'document_id': doc_id,
219
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
220
+ 'table_identifier': normalize_text(table_identifier),
221
+ 'table_title': table_title_normalized,
222
  'section': section,
223
+ 'sheet_name': sheet_name,
224
  'chunk_id': chunk_num,
225
  'row_start': current_rows[0]['_idx'] - 1,
226
  'row_end': current_rows[-1]['_idx'],
227
+ 'total_rows': len(normalized_rows),
228
  'chunk_size': len(content),
229
+ 'is_complete_table': False,
230
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
231
  }
232
 
233
  chunks.append(Document(text=content, metadata=metadata))
 
237
  current_rows = []
238
  current_size = 0
239
 
 
240
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
241
  row_copy['_idx'] = i + 1
242
  current_rows.append(row_copy)
243
  current_size += row_size
244
 
245
+ # Final chunk
246
  if current_rows:
247
  content = base_content + format_table_rows(current_rows)
248
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
249
  content += format_table_footer(table_identifier, doc_id)
250
 
251
  metadata = {
252
  'type': 'table',
253
  'document_id': doc_id,
254
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
255
+ 'table_identifier': normalize_text(table_identifier),
256
+ 'table_title': table_title_normalized,
257
  'section': section,
258
+ 'sheet_name': sheet_name,
259
  'chunk_id': chunk_num,
260
  'row_start': current_rows[0]['_idx'] - 1,
261
  'row_end': current_rows[-1]['_idx'],
262
+ 'total_rows': len(normalized_rows),
263
  'chunk_size': len(content),
264
+ 'is_complete_table': False,
265
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
266
  }
267
 
268
  chunks.append(Document(text=content, metadata=metadata))
 
271
  return chunks
272
 
273
 
274
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
    """Build the searchable header text that precedes a table's data rows.

    Emits every available identifier (number, sheet, title, section) plus a
    keyword line so retrieval can match the chunk on any of them, then the
    normalized column headers and the 'ДАННЫЕ:' marker.
    """
    parts = [f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"]

    # Only emit a real table number — '-' / 'unknown' are placeholders.
    if table_num and table_num not in ['-', 'unknown']:
        parts.append(f"НОМЕР ТАБЛИЦЫ: {normalize_text(table_num)}\n")

    if sheet_name:
        parts.append(f"ЛИСТ: {sheet_name}\n")

    if table_title:
        parts.append(f"НАЗВАНИЕ: {normalize_text(table_title)}\n")

    if section:
        parts.append(f"РАЗДЕЛ: {section}\n")

    # Extra keyword line improves recall during retrieval.
    parts.append(f"КЛЮЧЕВЫЕ СЛОВА: материалы стали марки стандарты {doc_id}\n")
    parts.append(f"{'='*70}\n")

    if headers:
        # Normalize headers too before joining them into one line.
        header_str = ' | '.join(normalize_text(str(h)) for h in headers)
        parts.append(f"ЗАГОЛОВКИ: {header_str}\n\n")

    parts.append("ДАННЫЕ:\n")
    return ''.join(parts)
303
 
 
304
  def format_single_row(row, idx):
305
  """Format a single row"""
306
  if isinstance(row, dict):