MrSimple07 committed on
Commit
e03faa9
·
1 Parent(s): 24b6b59

Fixed normalization, in header text as well

Browse files
Files changed (1) hide show
  1. index_retriever.py +4 -16
index_retriever.py CHANGED
@@ -11,23 +11,10 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
 
14
- # PREPROCESS ALL DOCUMENTS FOR CONSISTENT TOKENIZATION
15
- processed_docs = []
16
  connection_type_sources = {}
17
  table_count = 0
18
 
19
  for doc in documents:
20
- # Normalize text content for BM25
21
- if hasattr(doc, 'text'):
22
- from documents_prep import normalize_connection_type
23
- normalized_text = normalize_connection_type(doc.text)
24
- # Create a new Document with normalized text and same metadata
25
- doc = Document(
26
- text=normalized_text,
27
- metadata=doc.metadata
28
- )
29
- processed_docs.append(doc)
30
-
31
  if doc.metadata.get('type') == 'table':
32
  table_count += 1
33
  conn_type = doc.metadata.get('connection_type', '')
@@ -38,16 +25,17 @@ def create_vector_index(documents):
38
  connection_type_sources[conn_type].append(table_id)
39
 
40
  log_message("="*60)
41
- log_message(f"INDEXING {table_count} TABLE CHUNKS (NORMALIZED)")
42
  log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
43
  for conn_type in sorted(connection_type_sources.keys()):
44
- sources = list(set(connection_type_sources[conn_type]))
45
  log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
46
  for src in sources:
47
  log_message(f" - {src}")
48
  log_message("="*60)
49
 
50
- return VectorStoreIndex.from_documents(processed_docs)
 
51
 
52
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
53
  if not nodes or not reranker:
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
 
 
 
14
  connection_type_sources = {}
15
  table_count = 0
16
 
17
  for doc in documents:
 
 
 
 
 
 
 
 
 
 
 
18
  if doc.metadata.get('type') == 'table':
19
  table_count += 1
20
  conn_type = doc.metadata.get('connection_type', '')
 
25
  connection_type_sources[conn_type].append(table_id)
26
 
27
  log_message("="*60)
28
+ log_message(f"INDEXING {table_count} TABLE CHUNKS")
29
  log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
30
  for conn_type in sorted(connection_type_sources.keys()):
31
+ sources = list(set(connection_type_sources[conn_type])) # Unique sources
32
  log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
33
  for src in sources:
34
  log_message(f" - {src}")
35
  log_message("="*60)
36
 
37
+ return VectorStoreIndex.from_documents(documents)
38
+
39
 
40
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
41
  if not nodes or not reranker: