Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Oct 8, 2025

Commit

4834e86

1 Parent(s): 11e130c

removed normalization

Browse files

Files changed (2) hide show

documents_prep.py +1 -13
utils.py +22 -41

documents_prep.py CHANGED Viewed

@@ -183,10 +183,7 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
         type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
         if type_match:
             connection_type = type_match.group(0)
-            # NORMALIZE: Convert Cyrillic to Latin for consistency
-            connection_type_normalized = connection_type.replace('С', 'C').replace('У', 'U').replace('Т', 'T')
-            # Show BOTH in content for searchability
-            content += f"ТИП СОЕДИНЕНИЯ: {connection_type} ({connection_type_normalized})\n"
     if table_num and table_num != table_identifier:
         content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
@@ -446,15 +443,6 @@ def load_table_documents(repo_id, hf_token, table_dir):
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
-    log_message("="*60)
-    log_message("CONNECTION TYPE ENCODING CHECK:")
-    for chunk in all_chunks[:50]:  # Check first 50
-        conn_type = chunk.metadata.get('connection_type', '')
-        if 'C' in conn_type or 'С' in conn_type:
-            # Show both representations
-            log_message(f"  Original: '{conn_type}' | Bytes: {conn_type.encode('utf-8')}")
-    log_message("="*60)
     return all_chunks

         type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
         if type_match:
             connection_type = type_match.group(0)
+            content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
     if table_num and table_num != table_identifier:
         content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
     return all_chunks

utils.py CHANGED Viewed

@@ -173,26 +173,6 @@ def deduplicate_nodes(nodes):
     return unique_nodes
-def normalize_query(query):
-    """Normalize Cyrillic connection types to Latin in queries"""
-    import re
-    # Find all connection type patterns
-    pattern = r'[СУUTC]-?\d+(?:-\d+)?'
-    def replace_func(match):
-        conn_type = match.group(0)
-        # Convert Cyrillic to Latin
-        normalized = conn_type.replace('С', 'C').replace('У', 'U').replace('Т', 'T')
-        return normalized
-    normalized_query = re.sub(pattern, replace_func, query)
-    if normalized_query != query:
-        log_message(f"Query normalized: '{query}' → '{normalized_query}'")
-    return normalized_query
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
@@ -200,21 +180,16 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
     try:
         start_time = time.time()
-        # NORMALIZE query for better matching
-        normalized_question = normalize_query(question)
-        # Use normalized query for retrieval
-        retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
-        log_message(f"Original query: {question}")
-        if normalized_question != question:
-            log_message(f"Normalized query: {normalized_question}")
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
-        # Check for connection types - now check for NORMALIZED version
         conn_types_retrieved = {}
         for node in unique_retrieved:
             if node.metadata.get('type') == 'table':
@@ -227,21 +202,27 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
             for ct, cnt in sorted(conn_types_retrieved.items()):
                 log_message(f"  {ct}: {cnt} chunks")
-        # Check for the target type (normalized)
-        target_type = normalize_query("С-25")  # Will become "C-25" or "C25"
-        log_message(f"Checking for target connection type: {target_type}")
-        if any(t in question for t in ['С-25', 'C-25', 'C25']):
-            found_types = [ct for ct in conn_types_retrieved.keys()
-                          if 'C25' in ct or 'C-25' in ct]
-            if found_types:
-                log_message(f"✓ C-25 variants RETRIEVED: {found_types}")
             else:
-                log_message("✗ C-25 NOT RETRIEVED despite being in query!")
-        # Rest continues with normalized_question
-        reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
-        # Use ORIGINAL question for final response (user sees their original)
         response = query_engine.query(question)
         end_time = time.time()

     return unique_nodes
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
     try:
         start_time = time.time()
+        # DON'T normalize - use original query directly
+        retrieved_nodes = query_engine.retriever.retrieve(question)
+        log_message(f"user query: {question}")
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
+        # Check for connection types
         conn_types_retrieved = {}
         for node in unique_retrieved:
             if node.metadata.get('type') == 'table':
             for ct, cnt in sorted(conn_types_retrieved.items()):
                 log_message(f"  {ct}: {cnt} chunks")
+        # Check if target type was retrieved (keep original Cyrillic)
+        if 'С-25' in question:  # Use Cyrillic
+            if 'С-25' in conn_types_retrieved:
+                log_message(f"✓ С-25 RETRIEVED: {conn_types_retrieved['С-25']} chunks")
             else:
+                log_message("✗ С-25 NOT RETRIEVED despite being in query!")
+        # Sample of retrieved tables
+        log_message("SAMPLE OF RETRIEVED TABLES:")
+        for i, node in enumerate(unique_retrieved[:10]):
+            if node.metadata.get('type') == 'table':
+                table_num = node.metadata.get('table_number', 'N/A')
+                table_title = node.metadata.get('table_title', 'N/A')
+                conn_type = node.metadata.get('connection_type', 'N/A')
+                doc_id = node.metadata.get('document_id', 'N/A')
+                log_message(f"  [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
+        # Rerank
+        reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
+        # Direct query without formatting
         response = query_engine.query(question)
         end_time = time.time()