Spaces:
Sleeping
Sleeping
Commit ·
11e130c
1
Parent(s): 9c9aff4
Added a new function that replaces Cyrillic look-alike characters with Latin ones in C-25 connection-type codes
Browse files- documents_prep.py +13 -2
- utils.py +43 -22
documents_prep.py
CHANGED
|
@@ -183,7 +183,10 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
|
|
| 183 |
type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
|
| 184 |
if type_match:
|
| 185 |
connection_type = type_match.group(0)
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
if table_num and table_num != table_identifier:
|
| 189 |
content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
|
|
@@ -443,6 +446,15 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 443 |
log_message(f"Error loading {file_path}: {e}")
|
| 444 |
|
| 445 |
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
return all_chunks
|
| 447 |
|
| 448 |
|
|
@@ -494,7 +506,6 @@ def load_image_documents(repo_id, hf_token, image_dir):
|
|
| 494 |
return documents
|
| 495 |
|
| 496 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
| 497 |
-
"""Main loader - combines all document types"""
|
| 498 |
log_message("="*60)
|
| 499 |
log_message("STARTING DOCUMENT LOADING")
|
| 500 |
log_message("="*60)
|
|
|
|
| 183 |
type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
|
| 184 |
if type_match:
|
| 185 |
connection_type = type_match.group(0)
|
| 186 |
+
# NORMALIZE: Convert Cyrillic to Latin for consistency
|
| 187 |
+
connection_type_normalized = connection_type.replace('С', 'C').replace('У', 'U').replace('Т', 'T')
|
| 188 |
+
# Show BOTH in content for searchability
|
| 189 |
+
content += f"ТИП СОЕДИНЕНИЯ: {connection_type} ({connection_type_normalized})\n"
|
| 190 |
|
| 191 |
if table_num and table_num != table_identifier:
|
| 192 |
content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
|
|
|
|
| 446 |
log_message(f"Error loading {file_path}: {e}")
|
| 447 |
|
| 448 |
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
| 449 |
+
|
| 450 |
+
log_message("="*60)
|
| 451 |
+
log_message("CONNECTION TYPE ENCODING CHECK:")
|
| 452 |
+
for chunk in all_chunks[:50]: # Check first 50
|
| 453 |
+
conn_type = chunk.metadata.get('connection_type', '')
|
| 454 |
+
if 'C' in conn_type or 'С' in conn_type:
|
| 455 |
+
# Show both representations
|
| 456 |
+
log_message(f" Original: '{conn_type}' | Bytes: {conn_type.encode('utf-8')}")
|
| 457 |
+
log_message("="*60)
|
| 458 |
return all_chunks
|
| 459 |
|
| 460 |
|
|
|
|
| 506 |
return documents
|
| 507 |
|
| 508 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
|
|
|
| 509 |
log_message("="*60)
|
| 510 |
log_message("STARTING DOCUMENT LOADING")
|
| 511 |
log_message("="*60)
|
utils.py
CHANGED
|
@@ -173,20 +173,48 @@ def deduplicate_nodes(nodes):
|
|
| 173 |
return unique_nodes
|
| 174 |
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 177 |
if query_engine is None:
|
| 178 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 179 |
|
| 180 |
try:
|
| 181 |
start_time = time.time()
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 185 |
|
| 186 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 187 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 188 |
|
| 189 |
-
#
|
| 190 |
conn_types_retrieved = {}
|
| 191 |
for node in unique_retrieved:
|
| 192 |
if node.metadata.get('type') == 'table':
|
|
@@ -199,28 +227,21 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 199 |
for ct, cnt in sorted(conn_types_retrieved.items()):
|
| 200 |
log_message(f" {ct}: {cnt} chunks")
|
| 201 |
|
| 202 |
-
# Check
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
else:
|
| 207 |
-
log_message("✗
|
| 208 |
-
|
| 209 |
-
# Log sample of retrieved tables
|
| 210 |
-
log_message("SAMPLE OF RETRIEVED TABLES:")
|
| 211 |
-
for i, node in enumerate(unique_retrieved[:10]):
|
| 212 |
-
if node.metadata.get('type') == 'table':
|
| 213 |
-
table_num = node.metadata.get('table_number', 'N/A')
|
| 214 |
-
table_title = node.metadata.get('table_title', 'N/A')
|
| 215 |
-
conn_type = node.metadata.get('connection_type', 'N/A')
|
| 216 |
-
doc_id = node.metadata.get('document_id', 'N/A')
|
| 217 |
-
log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
|
| 218 |
-
|
| 219 |
-
# Rerank
|
| 220 |
-
reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
|
| 221 |
|
|
|
|
|
|
|
| 222 |
|
| 223 |
-
#
|
| 224 |
response = query_engine.query(question)
|
| 225 |
|
| 226 |
end_time = time.time()
|
|
|
|
| 173 |
return unique_nodes
|
| 174 |
|
| 175 |
|
| 176 |
+
def normalize_query(query):
    """Normalize Cyrillic connection-type codes to their Latin look-alikes.

    User queries may spell connection types with visually identical Cyrillic
    letters (e.g. 'С-25' typed with Cyrillic Es), which then fail to match
    Latin-encoded metadata during retrieval. Every matched code such as
    'С-25', 'У100' or 'T-10-5' is rewritten using Latin 'C', 'U', 'T'.

    Args:
        query: Raw user query string.

    Returns:
        The query with connection-type codes normalized to Latin letters;
        returned unchanged if no code matched or none needed rewriting.
    """
    import re

    # BUG FIX: the original character class '[СУUTC]' omitted Cyrillic 'Т'
    # even though the replacement chain handled it, so 'Т-25' (Cyrillic Te)
    # was never matched. Class order: Cyrillic С, У, Т then Latin U, T, C.
    pattern = r'[СУТUTC]-?\d+(?:-\d+)?'

    # Single-pass homoglyph mapping (Cyrillic С→C, У→U, Т→T) instead of
    # three chained str.replace calls.
    cyr_to_lat = str.maketrans('СУТ', 'CUT')

    normalized_query = re.sub(
        pattern, lambda m: m.group(0).translate(cyr_to_lat), query
    )

    if normalized_query != query:
        # log_message is a sibling helper defined elsewhere in utils.py.
        log_message(f"Query normalized: '{query}' → '{normalized_query}'")

    return normalized_query
|
| 195 |
+
|
| 196 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 197 |
if query_engine is None:
|
| 198 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 199 |
|
| 200 |
try:
|
| 201 |
start_time = time.time()
|
| 202 |
+
|
| 203 |
+
# NORMALIZE query for better matching
|
| 204 |
+
normalized_question = normalize_query(question)
|
| 205 |
+
|
| 206 |
+
# Use normalized query for retrieval
|
| 207 |
+
retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
|
| 208 |
+
log_message(f"Original query: {question}")
|
| 209 |
+
if normalized_question != question:
|
| 210 |
+
log_message(f"Normalized query: {normalized_question}")
|
| 211 |
+
|
| 212 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 213 |
|
| 214 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 215 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 216 |
|
| 217 |
+
# Check for connection types - now check for NORMALIZED version
|
| 218 |
conn_types_retrieved = {}
|
| 219 |
for node in unique_retrieved:
|
| 220 |
if node.metadata.get('type') == 'table':
|
|
|
|
| 227 |
for ct, cnt in sorted(conn_types_retrieved.items()):
|
| 228 |
log_message(f" {ct}: {cnt} chunks")
|
| 229 |
|
| 230 |
+
# Check for the target type (normalized)
|
| 231 |
+
target_type = normalize_query("С-25") # Will become "C-25" or "C25"
|
| 232 |
+
log_message(f"Checking for target connection type: {target_type}")
|
| 233 |
+
if any(t in question for t in ['С-25', 'C-25', 'C25']):
|
| 234 |
+
found_types = [ct for ct in conn_types_retrieved.keys()
|
| 235 |
+
if 'C25' in ct or 'C-25' in ct]
|
| 236 |
+
if found_types:
|
| 237 |
+
log_message(f"✓ C-25 variants RETRIEVED: {found_types}")
|
| 238 |
else:
|
| 239 |
+
log_message("✗ C-25 NOT RETRIEVED despite being in query!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
+
# Rest continues with normalized_question
|
| 242 |
+
reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
|
| 243 |
|
| 244 |
+
# Use ORIGINAL question for final response (user sees their original)
|
| 245 |
response = query_engine.query(question)
|
| 246 |
|
| 247 |
end_time = time.time()
|