Spaces:
Sleeping
Sleeping
Commit ·
e03faa9
1
Parent(s): 24b6b59
normalized fixed + in header text as well
Browse files- index_retriever.py +4 -16
index_retriever.py
CHANGED
|
@@ -11,23 +11,10 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
|
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
|
| 14 |
-
# PREPROCESS ALL DOCUMENTS FOR CONSISTENT TOKENIZATION
|
| 15 |
-
processed_docs = []
|
| 16 |
connection_type_sources = {}
|
| 17 |
table_count = 0
|
| 18 |
|
| 19 |
for doc in documents:
|
| 20 |
-
# Normalize text content for BM25
|
| 21 |
-
if hasattr(doc, 'text'):
|
| 22 |
-
from documents_prep import normalize_connection_type
|
| 23 |
-
normalized_text = normalize_connection_type(doc.text)
|
| 24 |
-
# Create a new Document with normalized text and same metadata
|
| 25 |
-
doc = Document(
|
| 26 |
-
text=normalized_text,
|
| 27 |
-
metadata=doc.metadata
|
| 28 |
-
)
|
| 29 |
-
processed_docs.append(doc)
|
| 30 |
-
|
| 31 |
if doc.metadata.get('type') == 'table':
|
| 32 |
table_count += 1
|
| 33 |
conn_type = doc.metadata.get('connection_type', '')
|
|
@@ -38,16 +25,17 @@ def create_vector_index(documents):
|
|
| 38 |
connection_type_sources[conn_type].append(table_id)
|
| 39 |
|
| 40 |
log_message("="*60)
|
| 41 |
-
log_message(f"INDEXING {table_count} TABLE CHUNKS
|
| 42 |
log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
|
| 43 |
for conn_type in sorted(connection_type_sources.keys()):
|
| 44 |
-
sources = list(set(connection_type_sources[conn_type]))
|
| 45 |
log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
|
| 46 |
for src in sources:
|
| 47 |
log_message(f" - {src}")
|
| 48 |
log_message("="*60)
|
| 49 |
|
| 50 |
-
return VectorStoreIndex.from_documents(
|
|
|
|
| 51 |
|
| 52 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
| 53 |
if not nodes or not reranker:
|
|
|
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
|
|
|
|
|
|
|
| 14 |
connection_type_sources = {}
|
| 15 |
table_count = 0
|
| 16 |
|
| 17 |
for doc in documents:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
if doc.metadata.get('type') == 'table':
|
| 19 |
table_count += 1
|
| 20 |
conn_type = doc.metadata.get('connection_type', '')
|
|
|
|
| 25 |
connection_type_sources[conn_type].append(table_id)
|
| 26 |
|
| 27 |
log_message("="*60)
|
| 28 |
+
log_message(f"INDEXING {table_count} TABLE CHUNKS")
|
| 29 |
log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
|
| 30 |
for conn_type in sorted(connection_type_sources.keys()):
|
| 31 |
+
sources = list(set(connection_type_sources[conn_type])) # Unique sources
|
| 32 |
log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
|
| 33 |
for src in sources:
|
| 34 |
log_message(f" - {src}")
|
| 35 |
log_message("="*60)
|
| 36 |
|
| 37 |
+
return VectorStoreIndex.from_documents(documents)
|
| 38 |
+
|
| 39 |
|
| 40 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
| 41 |
if not nodes or not reranker:
|