Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 8, 2025

Commit

e03faa9

1 Parent(s): 24b6b59

normalized fixed + in header text as well

Browse files

Files changed (1) hide show

index_retriever.py +4 -16

index_retriever.py CHANGED Viewed

@@ -11,23 +11,10 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
 def create_vector_index(documents):
     log_message("Строю векторный индекс")
-    # PREPROCESS ALL DOCUMENTS FOR CONSISTENT TOKENIZATION
-    processed_docs = []
     connection_type_sources = {}
     table_count = 0
     for doc in documents:
-        # Normalize text content for BM25
-        if hasattr(doc, 'text'):
-            from documents_prep import normalize_connection_type
-            normalized_text = normalize_connection_type(doc.text)
-            # Create a new Document with normalized text and same metadata
-            doc = Document(
-                text=normalized_text,
-                metadata=doc.metadata
-            )
-        processed_docs.append(doc)
         if doc.metadata.get('type') == 'table':
             table_count += 1
             conn_type = doc.metadata.get('connection_type', '')
@@ -38,16 +25,17 @@ def create_vector_index(documents):
                 connection_type_sources[conn_type].append(table_id)
     log_message("="*60)
-    log_message(f"INDEXING {table_count} TABLE CHUNKS (NORMALIZED)")
     log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
     for conn_type in sorted(connection_type_sources.keys()):
-        sources = list(set(connection_type_sources[conn_type]))
         log_message(f"  {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
         for src in sources:
             log_message(f"    - {src}")
     log_message("="*60)
-    return VectorStoreIndex.from_documents(processed_docs)
 def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
     if not nodes or not reranker:

 def create_vector_index(documents):
     log_message("Строю векторный индекс")
     connection_type_sources = {}
     table_count = 0
     for doc in documents:
         if doc.metadata.get('type') == 'table':
             table_count += 1
             conn_type = doc.metadata.get('connection_type', '')
                 connection_type_sources[conn_type].append(table_id)
     log_message("="*60)
+    log_message(f"INDEXING {table_count} TABLE CHUNKS")
     log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
     for conn_type in sorted(connection_type_sources.keys()):
+        sources = list(set(connection_type_sources[conn_type]))  # Unique sources
         log_message(f"  {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
         for src in sources:
             log_message(f"    - {src}")
     log_message("="*60)
+    return VectorStoreIndex.from_documents(documents)
 def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
     if not nodes or not reranker: