Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 8, 2025

Commit

9c77451

1 Parent(s): 60178fd

fixing hyphen normalization

Browse files

Files changed (3) hide show

documents_prep.py +28 -24
index_retriever.py +13 -6
utils.py +18 -2

documents_prep.py CHANGED Viewed

@@ -39,16 +39,17 @@ def normalize_connection_type(s):
     s = s.replace('С', 'C').replace('с', 'c')
     s = s.replace('У', 'U').replace('у', 'u')
     s = s.replace('Т', 'T').replace('т', 't')
-    # REMOVE HYPHENS for consistent tokenization
     s = s.replace('-', '')
     return s
 def extract_connection_type(text):
     # Pre-commit (left-pane) version; lines marked '-' are the ones this commit replaces.
     # Searches text for a connection-type designation and returns it after
     # normalize_connection_type, or '' when nothing matches.
     import re
-    # Match with or without hyphen
-    match = re.search(r'[СCс]-?\d+(?:-\d+)?', text)
     # The pattern above allows at most one trailing '-digits' group.
     if match:
-        return normalize_connection_type(match.group(0))
     return ''
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
@@ -181,23 +182,17 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     return chunks
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
-    # Start with clear identification
     content = f"ДОКУМЕНТ: {doc_id}\n"
     content += f"ТАБЛИЦА: {table_identifier}\n"
-    # Extract and emphasize the connection type if present
     if table_title:
         content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
-        # Parse type from title (e.g., "С-25" from "Тип сварного соединения С-25")
-        import re
-        type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
-        if type_match:
-            connection_type = type_match.group(0)
-            # NORMALIZE: Convert Cyrillic to Latin for consistency
-            connection_type_normalized = connection_type.replace('С', 'C').replace('У', 'U').replace('Т', 'T')
-            # Show BOTH in content for searchability
-            content += f"ТИП СОЕДИНЕНИЯ: {connection_type} ({connection_type_normalized})\n"
     if table_num and table_num != table_identifier:
         content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
@@ -207,7 +202,6 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
     content += f"\n{'='*70}\n"
-    # Add headers with better formatting
     if headers:
         content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
         for i, h in enumerate(headers, 1):
@@ -432,6 +426,8 @@ def load_table_documents(repo_id, hf_token, table_dir):
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
@@ -448,27 +444,35 @@ def load_table_documents(repo_id, hf_token, table_dir):
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                # Use the consistent MAX_CHARS_TABLE from config
                 chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
     log_message("="*60)
-    log_message("CONNECTION TYPE ENCODING CHECK:")
-    for chunk in all_chunks[:50]:  # Check first 50
-        conn_type = chunk.metadata.get('connection_type', '')
-        if 'C' in conn_type or 'С' in conn_type:
-            # Show both representations
-            log_message(f"  Original: '{conn_type}' | Bytes: {conn_type.encode('utf-8')}")
     log_message("="*60)
     return all_chunks
 def load_image_documents(repo_id, hf_token, image_dir):
     """Load image descriptions"""
     log_message("Loading images...")

     s = s.replace('С', 'C').replace('с', 'c')
     s = s.replace('У', 'U').replace('у', 'u')
     s = s.replace('Т', 'T').replace('т', 't')
+    # REMOVE ALL HYPHENS for consistent tokenization
     s = s.replace('-', '')
     return s
 def extract_connection_type(text):
     """Return the first connection-type designation in ``text``, normalized.

     Recognizes a type letter (С/У/Т in Cyrillic or C/U/T in Latin, either
     case) followed by an optional hyphen, digits, and any number of further
     ``-digits`` groups, e.g. ``С-25``, ``C25``, ``У-4-1``. The matched text
     is passed through ``normalize_connection_type`` before being returned.

     Args:
         text: String to search, e.g. a table title. Falsy values are accepted.

     Returns:
         The normalized designation, or ``''`` when ``text`` is empty or
         contains no designation.
     """
     import re
     # A missing/None title must not raise inside re.search.
     if not text:
         return ''
     # The letter class mirrors the letters normalize_connection_type
     # transliterates. The look-behind rejects a letter that merely ends a
     # longer word (e.g. the final letter of "ГОСТ-5264").
     match = re.search(r'(?<![^\W\d_])[СCсcУUуuТTтt]-?\d+(?:-\d+)*', text)
     if match is None:
         return ''
     return normalize_connection_type(match.group(0))
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     return chunks
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
     content = f"ДОКУМЕНТ: {doc_id}\n"
     content += f"ТАБЛИЦА: {table_identifier}\n"
     if table_title:
         content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
+        # Extract and normalize connection type
+        connection_type = extract_connection_type(table_title)
+        if connection_type:
+            # Show normalized version for searchability
+            content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
     if table_num and table_num != table_identifier:
         content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
     content += f"\n{'='*70}\n"
     if headers:
         content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
         for i, h in enumerate(headers, 1):
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
+    connection_type_sources = {}  # Track which table each type comes from
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                table_num = sheet.get('table_number', 'unknown')
+                table_title = sheet.get('table_title', '')
                 chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
+                # Track connection type source
+                conn_type = extract_connection_type(table_title)
+                if conn_type:
+                    if conn_type not in connection_type_sources:
+                        connection_type_sources[conn_type] = []
+                    connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
     log_message("="*60)
+    log_message("CONNECTION TYPES AND THEIR SOURCES:")
+    for conn_type in sorted(connection_type_sources.keys()):
+        sources = connection_type_sources[conn_type]
+        log_message(f"  {conn_type}: {len(sources)} tables")
+        for src in sources:
+            log_message(f"    - {src}")
     log_message("="*60)
     return all_chunks
 def load_image_documents(repo_id, hf_token, image_dir):
     """Load image descriptions"""
     log_message("Loading images...")

index_retriever.py CHANGED Viewed

@@ -11,25 +11,32 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
 def create_vector_index(documents):
     # Pre-commit (left-pane) version; lines marked '-' are removed by this commit.
     # Logs how many documents have metadata type 'table' and how many of them
     # carry each 'connection_type', then returns
     # VectorStoreIndex.from_documents(documents).
     log_message("Строю векторный индекс")
-    # NEW: Analyze connection types before indexing
-    connection_types = {}
     table_count = 0
     for doc in documents:
         if doc.metadata.get('type') == 'table':
             table_count += 1
             conn_type = doc.metadata.get('connection_type', '')
             # Chunks with an empty or missing connection_type are not tallied.
             if conn_type:
-                connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
     log_message("="*60)
     log_message(f"INDEXING {table_count} TABLE CHUNKS")
-    log_message("CONNECTION TYPES IN INDEX:")
-    for conn_type, count in sorted(connection_types.items()):
-        log_message(f"  {conn_type}: {count} chunks")
     log_message("="*60)
     return VectorStoreIndex.from_documents(documents)
 def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
     if not nodes or not reranker:
         return nodes[:top_k]

 def create_vector_index(documents):
     """Log a per-connection-type summary of table chunks, then build the index.

     For every document whose metadata ``type`` is ``'table'`` the function
     counts it and, when its ``connection_type`` metadata is non-empty,
     records a "<document_id> Table <table_number>" label under that type.
     The summary is written with ``log_message`` before indexing.

     Args:
         documents: Iterable of objects exposing a ``metadata`` mapping;
             passed unchanged to ``VectorStoreIndex.from_documents``.

     Returns:
         The result of ``VectorStoreIndex.from_documents(documents)``.
     """
     log_message("Строю векторный индекс")
     connection_type_sources = {}
     table_count = 0
     for doc in documents:
         if doc.metadata.get('type') != 'table':
             continue
         table_count += 1
         conn_type = doc.metadata.get('connection_type', '')
         if conn_type:
             table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
             connection_type_sources.setdefault(conn_type, []).append(table_id)
     log_message("="*60)
     log_message(f"INDEXING {table_count} TABLE CHUNKS")
     log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
     for conn_type in sorted(connection_type_sources):
         table_ids = connection_type_sources[conn_type]
         # Sort the de-duplicated labels: plain set iteration order is
         # arbitrary, which made the log differ from run to run.
         unique_sources = sorted(set(table_ids))
         log_message(f"  {conn_type}: {len(table_ids)} chunks from {len(unique_sources)} tables")
         for src in unique_sources:
             log_message(f"    - {src}")
     log_message("="*60)
     return VectorStoreIndex.from_documents(documents)
 def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
     if not nodes or not reranker:
         return nodes[:top_k]

utils.py CHANGED Viewed

@@ -173,6 +173,16 @@ def deduplicate_nodes(nodes):
     return unique_nodes
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
@@ -180,8 +190,14 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
     try:
         start_time = time.time()
-        # DON'T normalize - use original query directly
-        retrieved_nodes = query_engine.retriever.retrieve(question)
         log_message(f"user query: {question}")
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")

     return unique_nodes
def normalize_query(query):
    """Transliterate Cyrillic connection-type letters in ``query`` to Latin.

    Only a С/У/Т letter (either case) that starts a designation is changed:
    it must not be preceded by another letter and must be followed by an
    optional hyphen and a digit (``С-25`` -> ``C-25``, ``С25`` -> ``C25``).
    Hyphens and all other text are left untouched, so ordinary hyphenated
    words such as "Санкт-Петербург" are not altered.

    Args:
        query: The user's query string.

    Returns:
        The query with connection-type letters converted to their Latin
        look-alikes.
    """
    import re

    cyrillic_to_latin = str.maketrans('СсУуТт', 'CcUuTt')
    # Look-behind: not the tail of a longer word. Look-ahead: a designation
    # number must follow, with or without a hyphen.
    return re.sub(
        r'(?<![^\W\d_])[СсУуТт](?=-?\d)',
        lambda m: m.group(0).translate(cyrillic_to_latin),
        query,
    )
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
     try:
         start_time = time.time()
+        # NORMALIZE QUERY: Convert Cyrillic to Latin
+        normalized_question = normalize_query(question)
+        log_message(f"Original query: {question}")
+        if normalized_question != question:
+            log_message(f"Normalized query: {normalized_question}")
+        # Use normalized query for retrieval
+        retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
         log_message(f"user query: {question}")
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")