MrSimple07 committed on
Commit
04f5154
·
1 Parent(s): aafe88b

big debug change

Browse files
Files changed (3) hide show
  1. documents_prep.py +13 -2
  2. index_retriever.py +30 -51
  3. utils.py +32 -11
documents_prep.py CHANGED
@@ -491,8 +491,6 @@ def load_image_documents(repo_id, hf_token, image_dir):
491
  log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
492
 
493
  return documents
494
-
495
-
496
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
497
  """Main loader - combines all document types"""
498
  log_message("="*60)
@@ -506,6 +504,19 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
506
  # Load tables (already chunked)
507
  table_chunks = load_table_documents(repo_id, hf_token, table_dir)
508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  # Load images (no chunking needed)
510
  image_docs = load_image_documents(repo_id, hf_token, image_dir)
511
 
 
491
  log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
492
 
493
  return documents
 
 
494
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
495
  """Main loader - combines all document types"""
496
  log_message("="*60)
 
504
  # Load tables (already chunked)
505
  table_chunks = load_table_documents(repo_id, hf_token, table_dir)
506
 
507
+ # NEW: Analyze connection types in tables
508
+ connection_types = {}
509
+ for chunk in table_chunks:
510
+ conn_type = chunk.metadata.get('connection_type', '')
511
+ if conn_type:
512
+ connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
513
+
514
+ log_message("="*60)
515
+ log_message("CONNECTION TYPES FOUND IN TABLES:")
516
+ for conn_type, count in sorted(connection_types.items()):
517
+ log_message(f" {conn_type}: {count} chunks")
518
+ log_message("="*60)
519
+
520
  # Load images (no chunking needed)
521
  image_docs = load_image_documents(repo_id, hf_token, image_dir)
522
 
index_retriever.py CHANGED
@@ -10,6 +10,30 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
10
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  return VectorStoreIndex.from_documents(documents)
14
 
15
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
@@ -44,66 +68,20 @@ def create_query_engine(vector_index):
44
  try:
45
  from config import CUSTOM_PROMPT
46
 
47
- # Preprocess query to expand table number patterns
48
- class TableAwareRetriever:
49
- def __init__(self, base_retriever):
50
- self.base_retriever = base_retriever
51
-
52
- def retrieve(self, query_str):
53
- import re
54
-
55
- # Expand queries with table numbers
56
- queries = [query_str]
57
-
58
- # Extract table numbers like С-25, C-25, С25
59
- table_patterns = re.findall(r'[СCс]-?\s*\d+', query_str)
60
- if table_patterns:
61
- for pattern in table_patterns:
62
- # Normalize: "С-25" -> ["С-25", "C-25", "С25", "C25"]
63
- normalized = pattern.upper().replace(' ', '')
64
- variants = [
65
- normalized,
66
- normalized.replace('С', 'C'),
67
- normalized.replace('-', ''),
68
- normalized.replace('С', 'C').replace('-', '')
69
- ]
70
- for variant in variants:
71
- queries.append(f"тип соединения {variant}")
72
- queries.append(f"таблица {variant}")
73
-
74
- log_message(f"Searching with {len(queries)} query variants: {queries[:3]}...")
75
-
76
- # Retrieve with all variants
77
- all_nodes = []
78
- seen_ids = set()
79
-
80
- for q in queries:
81
- nodes = self.base_retriever.retrieve(q)
82
- for node in nodes:
83
- node_id = id(node)
84
- if node_id not in seen_ids:
85
- seen_ids.add(node_id)
86
- all_nodes.append(node)
87
-
88
- return all_nodes
89
-
90
  bm25_retriever = BM25Retriever.from_defaults(
91
  docstore=vector_index.docstore,
92
- similarity_top_k=100
93
  )
94
 
95
  vector_retriever = VectorIndexRetriever(
96
  index=vector_index,
97
- similarity_top_k=50,
98
- similarity_cutoff=0.3 # Lower threshold
99
  )
100
 
101
- # Wrap retrievers with table-aware logic
102
- table_aware_bm25 = TableAwareRetriever(bm25_retriever)
103
-
104
  hybrid_retriever = QueryFusionRetriever(
105
- [vector_retriever, table_aware_bm25],
106
- similarity_top_k=200, # Increase to capture more candidates
107
  num_queries=1
108
  )
109
 
@@ -120,6 +98,7 @@ def create_query_engine(vector_index):
120
 
121
  log_message("Query engine успешно создан")
122
  return query_engine
 
123
  except Exception as e:
124
  log_message(f"Ошибка создания query engine: {str(e)}")
125
  raise
 
10
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
+
14
+ # NEW: Analyze connection types before indexing
15
+ connection_types = {}
16
+ table_count = 0
17
+ for doc in documents:
18
+ if doc.metadata.get('type') == 'table':
19
+ table_count += 1
20
+ conn_type = doc.metadata.get('connection_type', '')
21
+ if conn_type:
22
+ connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
23
+
24
+ log_message("="*60)
25
+ log_message(f"INDEXING {table_count} TABLE CHUNKS")
26
+ log_message("CONNECTION TYPES IN INDEX:")
27
+ for conn_type, count in sorted(connection_types.items()):
28
+ log_message(f" {conn_type}: {count} chunks")
29
+
30
+ # Check for С-25 specifically
31
+ if 'С-25' in connection_types:
32
+ log_message(f"✓ С-25 FOUND: {connection_types['С-25']} chunks")
33
+ else:
34
+ log_message("✗ С-25 NOT FOUND IN INDEX!")
35
+ log_message("="*60)
36
+
37
  return VectorStoreIndex.from_documents(documents)
38
 
39
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
 
68
  try:
69
  from config import CUSTOM_PROMPT
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  bm25_retriever = BM25Retriever.from_defaults(
72
  docstore=vector_index.docstore,
73
+ similarity_top_k=200
74
  )
75
 
76
  vector_retriever = VectorIndexRetriever(
77
  index=vector_index,
78
+ similarity_top_k=200,
79
+ similarity_cutoff=0.15
80
  )
81
 
 
 
 
82
  hybrid_retriever = QueryFusionRetriever(
83
+ [vector_retriever, bm25_retriever],
84
+ similarity_top_k=150,
85
  num_queries=1
86
  )
87
 
 
98
 
99
  log_message("Query engine успешно создан")
100
  return query_engine
101
+
102
  except Exception as e:
103
  log_message(f"Ошибка создания query engine: {str(e)}")
104
  raise
utils.py CHANGED
@@ -181,24 +181,45 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
181
  start_time = time.time()
182
  retrieved_nodes = query_engine.retriever.retrieve(question)
183
  log_message(f"user query: {question}")
184
-
185
-
186
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
187
 
188
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
189
-
190
- # DEBUG: Log what was retrieved
191
- log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
192
- for i, node in enumerate(unique_retrieved): # All debug
193
- table_num = node.metadata.get('table_number', 'N/A')
194
- table_title = node.metadata.get('table_title', 'N/A')
195
- doc_id = node.metadata.get('document_id', 'N/A')
196
- log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
197
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
198
 
199
- # Simple reranking
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
201
 
 
202
  # Direct query without formatting
203
  response = query_engine.query(question)
204
 
 
181
  start_time = time.time()
182
  retrieved_nodes = query_engine.retriever.retrieve(question)
183
  log_message(f"user query: {question}")
 
 
184
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
185
 
186
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
 
 
 
 
 
 
187
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
188
 
189
+ # NEW: Check for connection types in retrieved nodes
190
+ conn_types_retrieved = {}
191
+ for node in unique_retrieved:
192
+ if node.metadata.get('type') == 'table':
193
+ conn_type = node.metadata.get('connection_type', '')
194
+ if conn_type:
195
+ conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1
196
+
197
+ if conn_types_retrieved:
198
+ log_message("CONNECTION TYPES IN RETRIEVED:")
199
+ for ct, cnt in sorted(conn_types_retrieved.items()):
200
+ log_message(f" {ct}: {cnt} chunks")
201
+
202
+ # Check if С-25 was retrieved
203
+ if 'С-25' in question:
204
+ if 'С-25' in conn_types_retrieved:
205
+ log_message(f"✓ С-25 RETRIEVED: {conn_types_retrieved['С-25']} chunks")
206
+ else:
207
+ log_message("✗ С-25 NOT RETRIEVED despite being in query!")
208
+
209
+ # Log sample of retrieved tables
210
+ log_message("SAMPLE OF RETRIEVED TABLES:")
211
+ for i, node in enumerate(unique_retrieved[:10]):
212
+ if node.metadata.get('type') == 'table':
213
+ table_num = node.metadata.get('table_number', 'N/A')
214
+ table_title = node.metadata.get('table_title', 'N/A')
215
+ conn_type = node.metadata.get('connection_type', 'N/A')
216
+ doc_id = node.metadata.get('document_id', 'N/A')
217
+ log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
218
+
219
+ # Rerank
220
  reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
221
 
222
+
223
  # Direct query without formatting
224
  response = query_engine.query(question)
225