MrSimple07 committed on
Commit
9ce9909
·
1 Parent(s): fbed18d

added new loggers for normalization

Browse files
checking_cosine.py CHANGED
@@ -1,6 +1,7 @@
1
  import numpy as np
2
  from sentence_transformers import SentenceTransformer, util
3
  from datetime import datetime
 
4
 
5
  EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
6
  QUERY = "по каким стандартам может быть применена сталь 08X18H10T?"
@@ -77,11 +78,9 @@ CHUNK_3000_30="""
77
  """
78
 
79
 
80
- import re
81
-
82
  mapping = {
83
- 'X': 'Х', 'H': 'Н', 'T': 'Т', 'C': 'С', 'B': 'В', 'K': 'К', 'M': 'М', 'A': 'А', 'R': 'Р',
84
- 'x': 'х', 'h': 'н', 't': 'т', 'c': 'с', 'b': 'в', 'k': 'к', 'm': 'м', 'a': 'а', 'r': 'р'
85
  }
86
  token_re = re.compile(r'\b[0-9A-Za-zА-Яа-яЁё\-\+_/\.]+\b')
87
 
@@ -102,16 +101,6 @@ def replace_latin_in_steel_tokens(text):
102
  return token
103
  return token_re.sub(repl_token, text)
104
 
105
- # Пример использования:
106
- chunk_fixed = replace_latin_in_steel_tokens(CHUNK_FULL)
107
- chunk_fixed_2 = replace_latin_in_steel_tokens(CHUNK_SHORT)
108
- chunk_fixed_3 = replace_latin_in_steel_tokens(CHUNK_3000_30)
109
- chunk_fixed_4 = replace_latin_in_steel_tokens(CHUNK_FULL)
110
- query_fixed = replace_latin_in_steel_tokens(QUERY)
111
- # затем model.encode([query_fixed, chunk_fixed, ...])
112
-
113
-
114
-
115
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Computes dot(a, b) / (||a|| * ||b||) using NumPy's Euclidean norm.
    NOTE(review): assumes both vectors are non-zero — a zero vector makes the
    denominator 0 (NaN/inf with a runtime warning); confirm callers never pass one.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom)
117
 
@@ -119,20 +108,23 @@ def main():
119
  model = SentenceTransformer(EMBEDDING_MODEL)
120
  print(f"🔹 Loaded embedding model: {EMBEDDING_MODEL}\n")
121
 
122
- # Encode all texts
 
 
 
 
123
  embeddings = model.encode([query_fixed, chunk_fixed, chunk_fixed_2, chunk_fixed_3])
124
- query_emb, full_emb, short_emb, sim_3000_30 = embeddings
125
 
126
- # Compute cosine similarities
127
  sim_full = cosine_similarity(query_emb, full_emb)
128
  sim_short = cosine_similarity(query_emb, short_emb)
129
- sim_3000_30 = cosine_similarity(query_emb, sim_3000_30)
130
 
131
  timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
132
  result_text = (
133
  f"Запрос: {QUERY}\n\n"
134
  f"Сходство (полный чанк): {sim_full:.4f}\n"
135
- f"Сходство (сокращённый чанк): {sim_short:.4f}\n\n"
136
  f"Сходство (чанк 3000 символов, 30 строк): {sim_3000_30:.4f}\n\n"
137
  f"Вывод: {'Сокращённый чанк ближе к запросу' if sim_short > sim_full else 'Полный чанк ближе к запросу'}\n"
138
  )
@@ -144,8 +136,5 @@ def main():
144
  print(result_text)
145
  print(f"✅ Результаты сохранены в файл: {output_file}")
146
 
147
- # ===============================================================
148
- # ENTRY POINT
149
- # ===============================================================
150
  if __name__ == "__main__":
151
- main()
 
1
  import numpy as np
2
  from sentence_transformers import SentenceTransformer, util
3
  from datetime import datetime
4
+ import re
5
 
6
  EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
7
  QUERY = "по каким стандартам может быть применена сталь 08X18H10T?"
 
78
  """
79
 
80
 
 
 
81
  mapping = {
82
+ 'X': 'Х', 'H': 'Н', 'T': 'Т', 'C': 'С', 'B': 'В', 'K': 'К', 'M': 'М', 'A': 'А', 'R': 'Р', 'P': 'Р',
83
+ 'x': 'х', 'h': 'н', 't': 'т', 'c': 'с', 'b': 'в', 'k': 'к', 'm': 'м', 'a': 'а', 'r': 'р', 'p': 'р'
84
  }
85
  token_re = re.compile(r'\b[0-9A-Za-zА-Яа-яЁё\-\+_/\.]+\b')
86
 
 
101
  return token
102
  return token_re.sub(repl_token, text)
103
 
 
 
 
 
 
 
 
 
 
 
104
def cosine_similarity(a, b):
    """Cosine similarity between vectors *a* and *b*.

    Returns float(np.dot(a, b) / (||a|| * ||b||)).
    NOTE(review): no guard against zero-length vectors — the division produces
    NaN/inf if either norm is 0; verify inputs upstream.
    """
    # Numerator: inner product; denominator: product of Euclidean norms.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
106
 
 
108
  model = SentenceTransformer(EMBEDDING_MODEL)
109
  print(f"🔹 Loaded embedding model: {EMBEDDING_MODEL}\n")
110
 
111
+ query_fixed = replace_latin_in_steel_tokens(QUERY)
112
+ chunk_fixed = replace_latin_in_steel_tokens(CHUNK_FULL)
113
+ chunk_fixed_2 = replace_latin_in_steel_tokens(CHUNK_SHORT)
114
+ chunk_fixed_3 = replace_latin_in_steel_tokens(CHUNK_3000_30)
115
+
116
  embeddings = model.encode([query_fixed, chunk_fixed, chunk_fixed_2, chunk_fixed_3])
117
+ query_emb, full_emb, short_emb, chunk_3000_emb = embeddings
118
 
 
119
  sim_full = cosine_similarity(query_emb, full_emb)
120
  sim_short = cosine_similarity(query_emb, short_emb)
121
+ sim_3000_30 = cosine_similarity(query_emb, chunk_3000_emb)
122
 
123
  timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
124
  result_text = (
125
  f"Запрос: {QUERY}\n\n"
126
  f"Сходство (полный чанк): {sim_full:.4f}\n"
127
+ f"Сходство (сокращённый чанк): {sim_short:.4f}\n"
128
  f"Сходство (чанк 3000 символов, 30 строк): {sim_3000_30:.4f}\n\n"
129
  f"Вывод: {'Сокращённый чанк ближе к запросу' if sim_short > sim_full else 'Полный чанк ближе к запросу'}\n"
130
  )
 
136
  print(result_text)
137
  print(f"✅ Результаты сохранены в файл: {output_file}")
138
 
 
 
 
139
  if __name__ == "__main__":
140
+ main()
chunk_similarity_results_2025-10-15_13-26-33.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Запрос: по каким стандартам может быть применена сталь 08X18H10T?
2
+
3
+ Сходство (полный чанк): 0.5152
4
+ Сходство (сокращённый чанк): 0.5219
5
+ Сходство (чанк 3000 символов, 30 строк): 0.5152
6
+
7
+ Вывод: Сокращённый чанк ближе к запросу
documents_prep.py CHANGED
@@ -26,19 +26,21 @@ def normalize_steel_designations(text):
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
  Only applies to specific patterns to avoid changing legitimate Latin text.
 
29
  """
30
  if not text:
31
- return text
32
 
33
  import re
34
 
35
- # Pattern 1: Steel grades like 08X18H10T, 12X18H10T, etc.
36
- # Format: digits + Latin letters (no spaces typically)
37
- # Common steel designation pattern: [\d]+[XHTKBMCAP]+[\d]*[XHTKBMCAP]*
38
 
39
  def replace_in_steel_grade(match):
40
  """Replace Latin with Cyrillic only in steel grade context"""
 
41
  grade = match.group(0)
 
 
42
  # Mapping of Latin to Cyrillic for steel designations
43
  replacements = {
44
  'X': 'Х', # Latin X -> Cyrillic Х (Kha)
@@ -53,6 +55,10 @@ def normalize_steel_designations(text):
53
  }
54
  for latin, cyrillic in replacements.items():
55
  grade = grade.replace(latin, cyrillic)
 
 
 
 
56
  return grade
57
 
58
  # Pattern for steel grades: digits followed by letters and more digits/letters
@@ -69,7 +75,7 @@ def normalize_steel_designations(text):
69
  text = re.sub(r'\b[C]-\d{1,2}\b',
70
  lambda m: m.group(0).replace('C', 'С'), text)
71
 
72
- return text
73
 
74
 
75
 
@@ -79,12 +85,23 @@ def chunk_text_documents(documents):
79
  chunk_overlap=CHUNK_OVERLAP
80
  )
81
 
 
 
 
 
 
 
82
  chunked = []
83
  for doc in documents:
84
  chunks = text_splitter.get_nodes_from_documents([doc])
85
  for i, chunk in enumerate(chunks):
86
  # Normalize steel designations in the chunk text
87
- chunk.text = normalize_steel_designations(chunk.text)
 
 
 
 
 
88
 
89
  chunk.metadata.update({
90
  'chunk_id': i,
@@ -100,6 +117,12 @@ def chunk_text_documents(documents):
100
  max_size = max(len(c.text) for c in chunked)
101
  log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
102
  log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
 
 
 
 
 
 
103
 
104
  return chunked
105
 
@@ -113,13 +136,10 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
113
  sheet_name = table_data.get('sheet_name', '')
114
 
115
  # Apply steel designation normalization to title and section
116
- table_title = normalize_steel_designations(str(table_title))
117
- section = normalize_steel_designations(section)
118
 
119
  table_num_clean = str(table_num).strip()
120
- table_title_normalized = normalize_text(str(table_title))
121
-
122
- import re
123
 
124
  import re
125
 
@@ -156,17 +176,35 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
156
 
157
  # Normalize all row content (including steel designations)
158
  normalized_rows = []
 
 
 
159
  for row in rows:
160
  if isinstance(row, dict):
161
- normalized_row = {k: normalize_steel_designations(str(v)) for k, v in row.items()}
 
 
 
 
 
 
 
 
 
162
  normalized_rows.append(normalized_row)
163
  else:
164
  normalized_rows.append(row)
165
 
166
- # Calculate base metadata size with NORMALIZED title
 
 
 
 
 
 
167
  base_content = format_table_header(doc_id, table_identifier, table_num,
168
- table_title_normalized, section, headers,
169
- sheet_name) # Pass sheet_name
170
  base_size = len(base_content)
171
  available_space = max_chars - base_size - 200
172
 
@@ -181,21 +219,20 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
181
  'type': 'table',
182
  'document_id': doc_id,
183
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
184
- 'table_identifier': normalize_text(table_identifier),
185
- 'table_title': table_title_normalized,
186
  'section': section,
187
- 'sheet_name': sheet_name, # ADD THIS
188
  'total_rows': len(normalized_rows),
189
  'chunk_size': len(content),
190
  'is_complete_table': True,
191
- # ADD SEARCHABLE KEYWORDS
192
  'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
193
  }
194
 
195
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
196
  return [Document(text=content, metadata=metadata)]
197
 
198
- # Chunking logic continues with normalized_rows instead of rows...
199
  chunks = []
200
  current_rows = []
201
  current_size = 0
@@ -217,8 +254,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
217
  'type': 'table',
218
  'document_id': doc_id,
219
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
220
- 'table_identifier': normalize_text(table_identifier),
221
- 'table_title': table_title_normalized,
222
  'section': section,
223
  'sheet_name': sheet_name,
224
  'chunk_id': chunk_num,
@@ -252,8 +289,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
252
  'type': 'table',
253
  'document_id': doc_id,
254
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
255
- 'table_identifier': normalize_text(table_identifier),
256
- 'table_title': table_title_normalized,
257
  'section': section,
258
  'sheet_name': sheet_name,
259
  'chunk_id': chunk_num,
@@ -271,6 +308,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
271
  return chunks
272
 
273
 
 
274
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
275
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
276
 
@@ -510,11 +548,15 @@ def extract_sections_from_json(json_path):
510
 
511
  def load_table_documents(repo_id, hf_token, table_dir):
512
  log_message("Loading tables...")
 
 
513
 
514
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
515
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
516
 
517
  all_chunks = []
 
 
518
  for file_path in table_files:
519
  try:
520
  local_path = hf_hub_download(
@@ -531,15 +573,19 @@ def load_table_documents(repo_id, hf_token, table_dir):
531
 
532
  for sheet in data.get('sheets', []):
533
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
534
 
535
- # Use the consistent MAX_CHARS_TABLE from config
536
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
 
537
  all_chunks.extend(chunks)
538
 
539
  except Exception as e:
540
  log_message(f"Error loading {file_path}: {e}")
541
 
542
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
 
 
543
  return all_chunks
544
 
545
 
 
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
  Only applies to specific patterns to avoid changing legitimate Latin text.
29
+ Returns: (normalized_text, changes_count)
30
  """
31
  if not text:
32
+ return text, 0
33
 
34
  import re
35
 
36
+ changes_count = 0
 
 
37
 
38
  def replace_in_steel_grade(match):
39
  """Replace Latin with Cyrillic only in steel grade context"""
40
+ nonlocal changes_count
41
  grade = match.group(0)
42
+ original_grade = grade
43
+
44
  # Mapping of Latin to Cyrillic for steel designations
45
  replacements = {
46
  'X': 'Х', # Latin X -> Cyrillic Х (Kha)
 
55
  }
56
  for latin, cyrillic in replacements.items():
57
  grade = grade.replace(latin, cyrillic)
58
+
59
+ if grade != original_grade:
60
+ changes_count += 1
61
+
62
  return grade
63
 
64
  # Pattern for steel grades: digits followed by letters and more digits/letters
 
75
  text = re.sub(r'\b[C]-\d{1,2}\b',
76
  lambda m: m.group(0).replace('C', 'С'), text)
77
 
78
+ return text, changes_count
79
 
80
 
81
 
 
85
  chunk_overlap=CHUNK_OVERLAP
86
  )
87
 
88
+ log_message("="*60)
89
+ log_message("NORMALIZING STEEL DESIGNATIONS IN TEXT CHUNKS")
90
+
91
+ total_normalizations = 0
92
+ chunks_with_changes = 0
93
+
94
  chunked = []
95
  for doc in documents:
96
  chunks = text_splitter.get_nodes_from_documents([doc])
97
  for i, chunk in enumerate(chunks):
98
  # Normalize steel designations in the chunk text
99
+ original_text = chunk.text
100
+ chunk.text, changes = normalize_steel_designations(chunk.text)
101
+
102
+ if changes > 0:
103
+ chunks_with_changes += 1
104
+ total_normalizations += changes
105
 
106
  chunk.metadata.update({
107
  'chunk_id': i,
 
117
  max_size = max(len(c.text) for c in chunked)
118
  log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
119
  log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
120
+ log_message(f" Steel designation normalization:")
121
+ log_message(f" - Chunks with changes: {chunks_with_changes}/{len(chunked)}")
122
+ log_message(f" - Total steel grades normalized: {total_normalizations}")
123
+ log_message(f" - Avg per affected chunk: {total_normalizations/chunks_with_changes:.1f}" if chunks_with_changes > 0 else " - No normalizations needed")
124
+
125
+ log_message("="*60)
126
 
127
  return chunked
128
 
 
136
  sheet_name = table_data.get('sheet_name', '')
137
 
138
  # Apply steel designation normalization to title and section
139
+ table_title, title_changes = normalize_steel_designations(str(table_title))
140
+ section, section_changes = normalize_steel_designations(section)
141
 
142
  table_num_clean = str(table_num).strip()
 
 
 
143
 
144
  import re
145
 
 
176
 
177
  # Normalize all row content (including steel designations)
178
  normalized_rows = []
179
+ total_row_changes = 0
180
+ rows_with_changes = 0
181
+
182
  for row in rows:
183
  if isinstance(row, dict):
184
+ normalized_row = {}
185
+ row_had_changes = False
186
+ for k, v in row.items():
187
+ normalized_val, changes = normalize_steel_designations(str(v))
188
+ normalized_row[k] = normalized_val
189
+ if changes > 0:
190
+ total_row_changes += changes
191
+ row_had_changes = True
192
+ if row_had_changes:
193
+ rows_with_changes += 1
194
  normalized_rows.append(normalized_row)
195
  else:
196
  normalized_rows.append(row)
197
 
198
+ # Log normalization stats for this table
199
+ if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
200
+ log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
201
+ f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
202
+
203
+ # Continue with rest of existing logic using normalized_rows...
204
+ # Calculate base metadata size
205
  base_content = format_table_header(doc_id, table_identifier, table_num,
206
+ table_title, section, headers,
207
+ sheet_name)
208
  base_size = len(base_content)
209
  available_space = max_chars - base_size - 200
210
 
 
219
  'type': 'table',
220
  'document_id': doc_id,
221
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
222
+ 'table_identifier': table_identifier,
223
+ 'table_title': table_title,
224
  'section': section,
225
+ 'sheet_name': sheet_name,
226
  'total_rows': len(normalized_rows),
227
  'chunk_size': len(content),
228
  'is_complete_table': True,
 
229
  'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
230
  }
231
 
232
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
233
  return [Document(text=content, metadata=metadata)]
234
 
235
+ # Chunking logic continues...
236
  chunks = []
237
  current_rows = []
238
  current_size = 0
 
254
  'type': 'table',
255
  'document_id': doc_id,
256
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
257
+ 'table_identifier': table_identifier,
258
+ 'table_title': table_title,
259
  'section': section,
260
  'sheet_name': sheet_name,
261
  'chunk_id': chunk_num,
 
289
  'type': 'table',
290
  'document_id': doc_id,
291
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
292
+ 'table_identifier': table_identifier,
293
+ 'table_title': table_title,
294
  'section': section,
295
  'sheet_name': sheet_name,
296
  'chunk_id': chunk_num,
 
308
  return chunks
309
 
310
 
311
+
312
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
313
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
314
 
 
548
 
549
  def load_table_documents(repo_id, hf_token, table_dir):
550
  log_message("Loading tables...")
551
+ log_message("="*60)
552
+ log_message("NORMALIZING STEEL DESIGNATIONS IN TABLES")
553
 
554
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
555
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
556
 
557
  all_chunks = []
558
+ tables_processed = 0
559
+
560
  for file_path in table_files:
561
  try:
562
  local_path = hf_hub_download(
 
573
 
574
  for sheet in data.get('sheets', []):
575
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
576
+ tables_processed += 1
577
 
578
+ chunks = chunk_table_by_content(sheet, sheet_doc_id,
579
+ max_chars=MAX_CHARS_TABLE,
580
+ max_rows=MAX_ROWS_TABLE)
581
  all_chunks.extend(chunks)
582
 
583
  except Exception as e:
584
  log_message(f"Error loading {file_path}: {e}")
585
 
586
+ log_message(f"✓ Loaded {len(all_chunks)} table chunks from {tables_processed} tables")
587
+ log_message("="*60)
588
+
589
  return all_chunks
590
 
591
 
utils.py CHANGED
@@ -201,7 +201,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
201
 
202
  normalized_question = normalize_text(question)
203
  log_message(f"Normalized question: {normalized_question}")
204
- normalized_question_2 = normalize_steel_designations(normalized_question)
205
  log_message(f"After steel normalization: {normalized_question_2}")
206
 
207
  if query_engine is None:
@@ -213,6 +213,8 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
213
  log_message(f"user query: {question}")
214
  log_message(f"normalized query: {normalized_question}")
215
  log_message(f"after steel normalization: {normalized_question_2}")
 
 
216
 
217
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
218
 
 
201
 
202
  normalized_question = normalize_text(question)
203
  log_message(f"Normalized question: {normalized_question}")
204
+ normalized_question_2, query_changes = normalize_steel_designations(question)
205
  log_message(f"After steel normalization: {normalized_question_2}")
206
 
207
  if query_engine is None:
 
213
  log_message(f"user query: {question}")
214
  log_message(f"normalized query: {normalized_question}")
215
  log_message(f"after steel normalization: {normalized_question_2}")
216
+ log_message(f"Steel grades normalized in query: {query_changes}")
217
+
218
 
219
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
220