Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 8, 2025

Commit

57a8908

1 Parent(s): 49bfa92

Fixed normalization; it is now applied to header text as well

Browse files

Files changed (3) hide show

documents_prep.py +20 -9
index_retriever.py +12 -4
utils.py +4 -3

documents_prep.py CHANGED Viewed

@@ -186,12 +186,13 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
     content += f"ТАБЛИЦА: {table_identifier}\n"
     if table_title:
-        content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
-        # Extract and normalize connection type
         connection_type = extract_connection_type(table_title)
         if connection_type:
-            # Show normalized version for searchability
             content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
     if table_num and table_num != table_identifier:
@@ -205,7 +206,9 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
     if headers:
         content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
         for i, h in enumerate(headers, 1):
-            content += f"  {i}. {h}\n"
         content += "\n"
     content += "ДАННЫЕ ТАБЛИЦЫ:\n"
@@ -213,19 +216,27 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
 def format_single_row(row, idx):
-    """Format a single row"""
     if isinstance(row, dict):
-        parts = [f"{k}: {v}" for k, v in row.items()
-                if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
         if parts:
             return f"{idx}. {' | '.join(parts)}\n"
     elif isinstance(row, list):
-        parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
         if parts:
             return f"{idx}. {' | '.join(parts)}\n"
     return ""
 def format_table_rows(rows):
     """Format multiple rows"""
     content = ""

     content += f"ТАБЛИЦА: {table_identifier}\n"
     if table_title:
+        # Normalize the title text itself for better searchability
+        normalized_title = normalize_connection_type(table_title)
+        content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
+        # Extract and store the normalized connection type
         connection_type = extract_connection_type(table_title)
         if connection_type:
             content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
     if table_num and table_num != table_identifier:
     if headers:
         content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
         for i, h in enumerate(headers, 1):
+            # NORMALIZE HEADERS TOO
+            normalized_header = normalize_connection_type(h)
+            content += f"  {i}. {normalized_header}\n"
         content += "\n"
     content += "ДАННЫЕ ТАБЛИЦЫ:\n"
 def format_single_row(row, idx):
     """Format a single table row as one numbered, pipe-separated line.

     Each kept value is converted to ``str`` and passed through
     ``normalize_connection_type``. Dict rows render as ``key: value`` pairs,
     list rows render as bare values.

     Args:
         row: A dict (column -> value) or a list of cell values. Any other
             type yields an empty string.
         idx: The row number placed at the start of the line.

     Returns:
         ``"<idx>. <part> | <part> ...\\n"``, or ``""`` when the row has no
         usable values or is neither a dict nor a list.
     """
     if isinstance(row, dict):
         cells = [(f"{key}: ", value) for key, value in row.items()]
     elif isinstance(row, list):
         cells = [("", value) for value in row]
     else:
         return ""

     parts = []
     for prefix, value in cells:
         # A plain truthiness test would also discard a legitimate numeric
         # zero, so numbers are exempt from it; bool is excluded because it
         # is a subclass of int and should keep the generic falsy handling.
         is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
         if not value and not is_number:
             continue
         text = str(value)
         # Blank strings and textual placeholders for missing data are dropped.
         if not text.strip() or text.lower() in ('nan', 'none'):
             continue
         parts.append(prefix + normalize_connection_type(text))

     if parts:
         return f"{idx}. {' | '.join(parts)}\n"
     return ""
 def format_table_rows(rows):
     """Format multiple rows"""
     content = ""

index_retriever.py CHANGED Viewed

@@ -11,10 +11,19 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
 def create_vector_index(documents):
     log_message("Строю векторный индекс")
     connection_type_sources = {}
     table_count = 0
     for doc in documents:
         if doc.metadata.get('type') == 'table':
             table_count += 1
             conn_type = doc.metadata.get('connection_type', '')
@@ -25,17 +34,16 @@ def create_vector_index(documents):
                 connection_type_sources[conn_type].append(table_id)
     log_message("="*60)
-    log_message(f"INDEXING {table_count} TABLE CHUNKS")
     log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
     for conn_type in sorted(connection_type_sources.keys()):
-        sources = list(set(connection_type_sources[conn_type]))  # Unique sources
         log_message(f"  {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
         for src in sources:
             log_message(f"    - {src}")
     log_message("="*60)
-    return VectorStoreIndex.from_documents(documents)
 def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
     if not nodes or not reranker:

 def create_vector_index(documents):
     log_message("Строю векторный индекс")
+    # PREPROCESS ALL DOCUMENTS FOR CONSISTENT TOKENIZATION
+    processed_docs = []
     connection_type_sources = {}
     table_count = 0
     for doc in documents:
+        # Normalize text content for BM25
+        if hasattr(doc, 'text'):
+            from documents_prep import normalize_connection_type
+            doc.text = normalize_connection_type(doc.text)
+        processed_docs.append(doc)
         if doc.metadata.get('type') == 'table':
             table_count += 1
             conn_type = doc.metadata.get('connection_type', '')
                 connection_type_sources[conn_type].append(table_id)
     log_message("="*60)
+    log_message(f"INDEXING {table_count} TABLE CHUNKS (NORMALIZED)")
     log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
     for conn_type in sorted(connection_type_sources.keys()):
+        sources = list(set(connection_type_sources[conn_type]))
         log_message(f"  {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
         for src in sources:
             log_message(f"    - {src}")
     log_message("="*60)
+    return VectorStoreIndex.from_documents(processed_docs)
 def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
     if not nodes or not reranker:

utils.py CHANGED Viewed

@@ -172,7 +172,6 @@ def deduplicate_nodes(nodes):
     return unique_nodes
 def normalize_query(query):
     """Normalize query to match stored format"""
     import re
@@ -180,8 +179,10 @@ def normalize_query(query):
     query = query.replace('С-', 'C-').replace('с-', 'c-')
     query = query.replace('У-', 'U-').replace('у-', 'u-')
     query = query.replace('Т-', 'T-').replace('т-', 't-')
-    query = query.replace('-', '')
     return query

     return unique_nodes
 def normalize_query(query):
     """Normalize a query so connection-type codes match the stored format.

     Cyrillic С/У/Т (either case) directly followed by a hyphen are replaced
     with the Latin letters C/U/T, then the hyphen between such a Latin
     letter and a digit is dropped (``C-25`` -> ``C25``). Hyphens anywhere
     else in the query are left untouched.

     Args:
         query: The raw query string.

     Returns:
         The normalized query string.
     """
     import re

     # Cyrillic -> Latin look-alikes, only when they directly precede a hyphen.
     query = query.replace('С-', 'C-').replace('с-', 'c-')
     query = query.replace('У-', 'U-').replace('у-', 'u-')
     query = query.replace('Т-', 'T-').replace('т-', 't-')
     # Remove hyphens from connection type patterns (C-25 -> C25).
     # The hyphen has to be part of the pattern: without it the substitution
     # rewrites every match to itself and nothing is removed.
     query = re.sub(r'([CUTcut])-(\d)', r'\1\2', query)
     return query