MrSimple07 committed on
Commit
9ce9909
·
1 Parent(s): fbed18d

added new loggers for normalization

Browse files
checking_cosine.py CHANGED
@@ -1,6 +1,7 @@
1
  import numpy as np
2
  from sentence_transformers import SentenceTransformer, util
3
  from datetime import datetime
 
4
 
5
  EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
6
  QUERY = "по каким стандартам может быть применена сталь 08X18H10T?"
@@ -77,11 +78,9 @@ CHUNK_3000_30="""
77
  """
78
 
79
 
80
- import re
81
-
82
  mapping = {
83
- 'X': 'Х', 'H': 'Н', 'T': 'Т', 'C': 'С', 'B': 'В', 'K': 'К', 'M': 'М', 'A': 'А', 'R': 'Р',
84
- 'x': 'х', 'h': 'н', 't': 'т', 'c': 'с', 'b': 'в', 'k': 'к', 'm': 'м', 'a': 'а', 'r': 'р'
85
  }
86
  token_re = re.compile(r'\b[0-9A-Za-zА-Яа-яЁё\-\+_/\.]+\b')
87
 
@@ -102,16 +101,6 @@ def replace_latin_in_steel_tokens(text):
102
  return token
103
  return token_re.sub(repl_token, text)
104
 
105
- # Пример использования:
106
- chunk_fixed = replace_latin_in_steel_tokens(CHUNK_FULL)
107
- chunk_fixed_2 = replace_latin_in_steel_tokens(CHUNK_SHORT)
108
- chunk_fixed_3 = replace_latin_in_steel_tokens(CHUNK_3000_30)
109
- chunk_fixed_4 = replace_latin_in_steel_tokens(CHUNK_FULL)
110
- query_fixed = replace_latin_in_steel_tokens(QUERY)
111
- # затем model.encode([query_fixed, chunk_fixed, ...])
112
-
113
-
114
-
115
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Computes dot(a, b) / (||a|| * ||b||) using NumPy's Euclidean norm.
    NOTE(review): assumes both vectors are non-zero — a zero vector makes the
    denominator 0 (NaN/inf with a runtime warning); confirm callers never pass one.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom)
117
 
@@ -119,20 +108,23 @@ def main():
119
  model = SentenceTransformer(EMBEDDING_MODEL)
120
  print(f"🔹 Loaded embedding model: {EMBEDDING_MODEL}\n")
121
 
122
- # Encode all texts
 
 
 
 
123
  embeddings = model.encode([query_fixed, chunk_fixed, chunk_fixed_2, chunk_fixed_3])
124
- query_emb, full_emb, short_emb, sim_3000_30 = embeddings
125
 
126
- # Compute cosine similarities
127
  sim_full = cosine_similarity(query_emb, full_emb)
128
  sim_short = cosine_similarity(query_emb, short_emb)
129
- sim_3000_30 = cosine_similarity(query_emb, sim_3000_30)
130
 
131
  timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
132
  result_text = (
133
  f"Запрос: {QUERY}\n\n"
134
  f"Сходство (полный чанк): {sim_full:.4f}\n"
135
- f"Сходство (сокращённый чанк): {sim_short:.4f}\n\n"
136
  f"Сходство (чанк 3000 символов, 30 строк): {sim_3000_30:.4f}\n\n"
137
  f"Вывод: {'Сокращённый чанк ближе к запросу' if sim_short > sim_full else 'Полный чанк ближе к запросу'}\n"
138
  )
@@ -144,8 +136,5 @@ def main():
144
  print(result_text)
145
  print(f"✅ Результаты сохранены в файл: {output_file}")
146
 
147
- # ===============================================================
148
- # ENTRY POINT
149
- # ===============================================================
150
  if __name__ == "__main__":
151
- main()
 
1
  import numpy as np
2
  from sentence_transformers import SentenceTransformer, util
3
  from datetime import datetime
4
+ import re
5
 
6
  EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
7
  QUERY = "по каким стандартам может быть применена сталь 08X18H10T?"
 
78
  """
79
 
80
 
 
 
81
  mapping = {
82
+ 'X': 'Х', 'H': 'Н', 'T': 'Т', 'C': 'С', 'B': 'В', 'K': 'К', 'M': 'М', 'A': 'А', 'R': 'Р', 'P': 'Р',
83
+ 'x': 'х', 'h': 'н', 't': 'т', 'c': 'с', 'b': 'в', 'k': 'к', 'm': 'м', 'a': 'а', 'r': 'р', 'p': 'р'
84
  }
85
  token_re = re.compile(r'\b[0-9A-Za-zА-Яа-яЁё\-\+_/\.]+\b')
86
 
 
101
  return token
102
  return token_re.sub(repl_token, text)
103
 
 
 
 
 
 
 
 
 
 
 
104
def cosine_similarity(a, b):
    """Cosine similarity between vectors *a* and *b*.

    Returns float(np.dot(a, b) / (||a|| * ||b||)).
    NOTE(review): no guard against zero-length vectors — the division produces
    NaN/inf if either norm is 0; verify inputs upstream.
    """
    # Numerator: inner product; denominator: product of Euclidean norms.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
106
 
 
108
  model = SentenceTransformer(EMBEDDING_MODEL)
109
  print(f"🔹 Loaded embedding model: {EMBEDDING_MODEL}\n")
110
 
111
+ query_fixed = replace_latin_in_steel_tokens(QUERY)
112
+ chunk_fixed = replace_latin_in_steel_tokens(CHUNK_FULL)
113
+ chunk_fixed_2 = replace_latin_in_steel_tokens(CHUNK_SHORT)
114
+ chunk_fixed_3 = replace_latin_in_steel_tokens(CHUNK_3000_30)
115
+
116
  embeddings = model.encode([query_fixed, chunk_fixed, chunk_fixed_2, chunk_fixed_3])
117
+ query_emb, full_emb, short_emb, chunk_3000_emb = embeddings
118
 
 
119
  sim_full = cosine_similarity(query_emb, full_emb)
120
  sim_short = cosine_similarity(query_emb, short_emb)
121
+ sim_3000_30 = cosine_similarity(query_emb, chunk_3000_emb)
122
 
123
  timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
124
  result_text = (
125
  f"Запрос: {QUERY}\n\n"
126
  f"Сходство (полный чанк): {sim_full:.4f}\n"
127
+ f"Сходство (сокращённый чанк): {sim_short:.4f}\n"
128
  f"Сходство (чанк 3000 символов, 30 строк): {sim_3000_30:.4f}\n\n"
129
  f"Вывод: {'Сокращённый чанк ближе к запросу' if sim_short > sim_full else 'Полный чанк ближе к запросу'}\n"
130
  )
 
136
  print(result_text)
137
  print(f"✅ Результаты сохранены в файл: {output_file}")
138
 
 
 
 
139
  if __name__ == "__main__":
140
+ main()
chunk_similarity_results_2025-10-15_13-26-33.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Запрос: по каким стандартам может быть применена сталь 08X18H10T?
2
+
3
+ Сходство (полный чанк): 0.5152
4
+ Сходство (сокращённый чанк): 0.5219
5
+ Сходство (чанк 3000 символов, 30 строк): 0.5152
6
+
7
+ Вывод: Сокращённый чанк ближе к запросу
documents_prep.py CHANGED
@@ -26,19 +26,21 @@ def normalize_steel_designations(text):
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
  Only applies to specific patterns to avoid changing legitimate Latin text.
 
29
  """
30
  if not text:
31
- return text
32
 
33
  import re
34
 
35
- # Pattern 1: Steel grades like 08X18H10T, 12X18H10T, etc.
36
- # Format: digits + Latin letters (no spaces typically)
37
- # Common steel designation pattern: [\d]+[XHTKBMCAP]+[\d]*[XHTKBMCAP]*
38
 
39
  def replace_in_steel_grade(match):
40
  """Replace Latin with Cyrillic only in steel grade context"""
 
41
  grade = match.group(0)
 
 
42
  # Mapping of Latin to Cyrillic for steel designations
43
  replacements = {
44
  'X': 'Х', # Latin X -> Cyrillic Х (Kha)
@@ -53,6 +55,10 @@ def normalize_steel_designations(text):
53
  }
54
  for latin, cyrillic in replacements.items():
55
  grade = grade.replace(latin, cyrillic)
 
 
 
 
56
  return grade
57
 
58
  # Pattern for steel grades: digits followed by letters and more digits/letters
@@ -69,7 +75,7 @@ def normalize_steel_designations(text):
69
  text = re.sub(r'\b[C]-\d{1,2}\b',
70
  lambda m: m.group(0).replace('C', 'С'), text)
71
 
72
- return text
73
 
74
 
75
 
@@ -79,12 +85,23 @@ def chunk_text_documents(documents):
79
  chunk_overlap=CHUNK_OVERLAP
80
  )
81
 
 
 
 
 
 
 
82
  chunked = []
83
  for doc in documents:
84
  chunks = text_splitter.get_nodes_from_documents([doc])
85
  for i, chunk in enumerate(chunks):
86
  # Normalize steel designations in the chunk text
87
- chunk.text = normalize_steel_designations(chunk.text)
 
 
 
 
 
88
 
89
  chunk.metadata.update({
90
  'chunk_id': i,
@@ -100,6 +117,12 @@ def chunk_text_documents(documents):
100
  max_size = max(len(c.text) for c in chunked)
101
  log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
102
  log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
 
 
 
 
 
 
103
 
104
  return chunked
105
 
@@ -113,13 +136,10 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
113
  sheet_name = table_data.get('sheet_name', '')
114
 
115
  # Apply steel designation normalization to title and section
116
- table_title = normalize_steel_designations(str(table_title))
117
- section = normalize_steel_designations(section)
118
 
119
  table_num_clean = str(table_num).strip()
120
- table_title_normalized = normalize_text(str(table_title))
121
-
122
- import re
123
 
124
  import re
125
 
@@ -156,17 +176,35 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
156
 
157
  # Normalize all row content (including steel designations)
158
  normalized_rows = []
 
 
 
159
  for row in rows:
160
  if isinstance(row, dict):
161
- normalized_row = {k: normalize_steel_designations(str(v)) for k, v in row.items()}
 
 
 
 
 
 
 
 
 
162
  normalized_rows.append(normalized_row)
163
  else:
164
  normalized_rows.append(row)
165
 
166
- # Calculate base metadata size with NORMALIZED title
 
 
 
 
 
 
167
  base_content = format_table_header(doc_id, table_identifier, table_num,
168
- table_title_normalized, section, headers,
169
- sheet_name) # Pass sheet_name
170
  base_size = len(base_content)
171
  available_space = max_chars - base_size - 200
172
 
@@ -181,21 +219,20 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
181
  'type': 'table',
182
  'document_id': doc_id,
183
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
184
- 'table_identifier': normalize_text(table_identifier),
185
- 'table_title': table_title_normalized,
186
  'section': section,
187
- 'sheet_name': sheet_name, # ADD THIS
188
  'total_rows': len(normalized_rows),
189
  'chunk_size': len(content),
190
  'is_complete_table': True,
191
- # ADD SEARCHABLE KEYWORDS
192
  'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
193
  }
194
 
195
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
196
  return [Document(text=content, metadata=metadata)]
197
 
198
- # Chunking logic continues with normalized_rows instead of rows...
199
  chunks = []
200
  current_rows = []
201
  current_size = 0
@@ -217,8 +254,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
217
  'type': 'table',
218
  'document_id': doc_id,
219
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
220
- 'table_identifier': normalize_text(table_identifier),
221
- 'table_title': table_title_normalized,
222
  'section': section,
223
  'sheet_name': sheet_name,
224
  'chunk_id': chunk_num,
@@ -252,8 +289,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
252
  'type': 'table',
253
  'document_id': doc_id,
254
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
255
- 'table_identifier': normalize_text(table_identifier),
256
- 'table_title': table_title_normalized,
257
  'section': section,
258
  'sheet_name': sheet_name,
259
  'chunk_id': chunk_num,
@@ -271,6 +308,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
271
  return chunks
272
 
273
 
 
274
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
275
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
276
 
@@ -510,11 +548,15 @@ def extract_sections_from_json(json_path):
510
 
511
  def load_table_documents(repo_id, hf_token, table_dir):
512
  log_message("Loading tables...")
 
 
513
 
514
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
515
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
516
 
517
  all_chunks = []
 
 
518
  for file_path in table_files:
519
  try:
520
  local_path = hf_hub_download(
@@ -531,15 +573,19 @@ def load_table_documents(repo_id, hf_token, table_dir):
531
 
532
  for sheet in data.get('sheets', []):
533
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
534
 
535
- # Use the consistent MAX_CHARS_TABLE from config
536
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
 
537
  all_chunks.extend(chunks)
538
 
539
  except Exception as e:
540
  log_message(f"Error loading {file_path}: {e}")
541
 
542
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
 
 
543
  return all_chunks
544
 
545
 
 
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
  Only applies to specific patterns to avoid changing legitimate Latin text.
29
+ Returns: (normalized_text, changes_count)
30
  """
31
  if not text:
32
+ return text, 0
33
 
34
  import re
35
 
36
+ changes_count = 0
 
 
37
 
38
  def replace_in_steel_grade(match):
39
  """Replace Latin with Cyrillic only in steel grade context"""
40
+ nonlocal changes_count
41
  grade = match.group(0)
42
+ original_grade = grade
43
+
44
  # Mapping of Latin to Cyrillic for steel designations
45
  replacements = {
46
  'X': 'Х', # Latin X -> Cyrillic Х (Kha)
 
55
  }
56
  for latin, cyrillic in replacements.items():
57
  grade = grade.replace(latin, cyrillic)
58
+
59
+ if grade != original_grade:
60
+ changes_count += 1
61
+
62
  return grade
63
 
64
  # Pattern for steel grades: digits followed by letters and more digits/letters
 
75
  text = re.sub(r'\b[C]-\d{1,2}\b',
76
  lambda m: m.group(0).replace('C', 'С'), text)
77
 
78
+ return text, changes_count
79
 
80
 
81
 
 
85
  chunk_overlap=CHUNK_OVERLAP
86
  )
87
 
88
+ log_message("="*60)
89
+ log_message("NORMALIZING STEEL DESIGNATIONS IN TEXT CHUNKS")
90
+
91
+ total_normalizations = 0
92
+ chunks_with_changes = 0
93
+
94
  chunked = []
95
  for doc in documents:
96
  chunks = text_splitter.get_nodes_from_documents([doc])
97
  for i, chunk in enumerate(chunks):
98
  # Normalize steel designations in the chunk text
99
+ original_text = chunk.text
100
+ chunk.text, changes = normalize_steel_designations(chunk.text)
101
+
102
+ if changes > 0:
103
+ chunks_with_changes += 1
104
+ total_normalizations += changes
105
 
106
  chunk.metadata.update({
107
  'chunk_id': i,
 
117
  max_size = max(len(c.text) for c in chunked)
118
  log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
119
  log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
120
+ log_message(f" Steel designation normalization:")
121
+ log_message(f" - Chunks with changes: {chunks_with_changes}/{len(chunked)}")
122
+ log_message(f" - Total steel grades normalized: {total_normalizations}")
123
+ log_message(f" - Avg per affected chunk: {total_normalizations/chunks_with_changes:.1f}" if chunks_with_changes > 0 else " - No normalizations needed")
124
+
125
+ log_message("="*60)
126
 
127
  return chunked
128
 
 
136
  sheet_name = table_data.get('sheet_name', '')
137
 
138
  # Apply steel designation normalization to title and section
139
+ table_title, title_changes = normalize_steel_designations(str(table_title))
140
+ section, section_changes = normalize_steel_designations(section)
141
 
142
  table_num_clean = str(table_num).strip()
 
 
 
143
 
144
  import re
145
 
 
176
 
177
  # Normalize all row content (including steel designations)
178
  normalized_rows = []
179
+ total_row_changes = 0
180
+ rows_with_changes = 0
181
+
182
  for row in rows:
183
  if isinstance(row, dict):
184
+ normalized_row = {}
185
+ row_had_changes = False
186
+ for k, v in row.items():
187
+ normalized_val, changes = normalize_steel_designations(str(v))
188
+ normalized_row[k] = normalized_val
189
+ if changes > 0:
190
+ total_row_changes += changes
191
+ row_had_changes = True
192
+ if row_had_changes:
193
+ rows_with_changes += 1
194
  normalized_rows.append(normalized_row)
195
  else:
196
  normalized_rows.append(row)
197
 
198
+ # Log normalization stats for this table
199
+ if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
200
+ log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
201
+ f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
202
+
203
+ # Continue with rest of existing logic using normalized_rows...
204
+ # Calculate base metadata size
205
  base_content = format_table_header(doc_id, table_identifier, table_num,
206
+ table_title, section, headers,
207
+ sheet_name)
208
  base_size = len(base_content)
209
  available_space = max_chars - base_size - 200
210
 
 
219
  'type': 'table',
220
  'document_id': doc_id,
221
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
222
+ 'table_identifier': table_identifier,
223
+ 'table_title': table_title,
224
  'section': section,
225
+ 'sheet_name': sheet_name,
226
  'total_rows': len(normalized_rows),
227
  'chunk_size': len(content),
228
  'is_complete_table': True,
 
229
  'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
230
  }
231
 
232
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
233
  return [Document(text=content, metadata=metadata)]
234
 
235
+ # Chunking logic continues...
236
  chunks = []
237
  current_rows = []
238
  current_size = 0
 
254
  'type': 'table',
255
  'document_id': doc_id,
256
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
257
+ 'table_identifier': table_identifier,
258
+ 'table_title': table_title,
259
  'section': section,
260
  'sheet_name': sheet_name,
261
  'chunk_id': chunk_num,
 
289
  'type': 'table',
290
  'document_id': doc_id,
291
  'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
292
+ 'table_identifier': table_identifier,
293
+ 'table_title': table_title,
294
  'section': section,
295
  'sheet_name': sheet_name,
296
  'chunk_id': chunk_num,
 
308
  return chunks
309
 
310
 
311
+
312
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
313
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
314
 
 
548
 
549
  def load_table_documents(repo_id, hf_token, table_dir):
550
  log_message("Loading tables...")
551
+ log_message("="*60)
552
+ log_message("NORMALIZING STEEL DESIGNATIONS IN TABLES")
553
 
554
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
555
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
556
 
557
  all_chunks = []
558
+ tables_processed = 0
559
+
560
  for file_path in table_files:
561
  try:
562
  local_path = hf_hub_download(
 
573
 
574
  for sheet in data.get('sheets', []):
575
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
576
+ tables_processed += 1
577
 
578
+ chunks = chunk_table_by_content(sheet, sheet_doc_id,
579
+ max_chars=MAX_CHARS_TABLE,
580
+ max_rows=MAX_ROWS_TABLE)
581
  all_chunks.extend(chunks)
582
 
583
  except Exception as e:
584
  log_message(f"Error loading {file_path}: {e}")
585
 
586
+ log_message(f"✓ Loaded {len(all_chunks)} table chunks from {tables_processed} tables")
587
+ log_message("="*60)
588
+
589
  return all_chunks
590
 
591
 
utils.py CHANGED
@@ -201,7 +201,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
201
 
202
  normalized_question = normalize_text(question)
203
  log_message(f"Normalized question: {normalized_question}")
204
- normalized_question_2 = normalize_steel_designations(normalized_question)
205
  log_message(f"After steel normalization: {normalized_question_2}")
206
 
207
  if query_engine is None:
@@ -213,6 +213,8 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
213
  log_message(f"user query: {question}")
214
  log_message(f"normalized query: {normalized_question}")
215
  log_message(f"after steel normalization: {normalized_question_2}")
 
 
216
 
217
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
218
 
 
201
 
202
  normalized_question = normalize_text(question)
203
  log_message(f"Normalized question: {normalized_question}")
204
+ normalized_question_2, query_changes = normalize_steel_designations(question)
205
  log_message(f"After steel normalization: {normalized_question_2}")
206
 
207
  if query_engine is None:
 
213
  log_message(f"user query: {question}")
214
  log_message(f"normalized query: {normalized_question}")
215
  log_message(f"after steel normalization: {normalized_question_2}")
216
+ log_message(f"Steel grades normalized in query: {query_changes}")
217
+
218
 
219
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
220