Spaces:
Sleeping
Sleeping
Commit
·
04f5154
1
Parent(s):
aafe88b
big debug change
Browse files
- documents_prep.py +13 -2
- index_retriever.py +30 -51
- utils.py +32 -11
documents_prep.py
CHANGED
|
@@ -491,8 +491,6 @@ def load_image_documents(repo_id, hf_token, image_dir):
|
|
| 491 |
log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
|
| 492 |
|
| 493 |
return documents
|
| 494 |
-
|
| 495 |
-
|
| 496 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
| 497 |
"""Main loader - combines all document types"""
|
| 498 |
log_message("="*60)
|
|
@@ -506,6 +504,19 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
|
| 506 |
# Load tables (already chunked)
|
| 507 |
table_chunks = load_table_documents(repo_id, hf_token, table_dir)
|
| 508 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
# Load images (no chunking needed)
|
| 510 |
image_docs = load_image_documents(repo_id, hf_token, image_dir)
|
| 511 |
|
|
|
|
| 491 |
log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
|
| 492 |
|
| 493 |
return documents
|
|
|
|
|
|
|
| 494 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
| 495 |
"""Main loader - combines all document types"""
|
| 496 |
log_message("="*60)
|
|
|
|
| 504 |
# Load tables (already chunked)
|
| 505 |
table_chunks = load_table_documents(repo_id, hf_token, table_dir)
|
| 506 |
|
| 507 |
+
# NEW: Analyze connection types in tables
|
| 508 |
+
connection_types = {}
|
| 509 |
+
for chunk in table_chunks:
|
| 510 |
+
conn_type = chunk.metadata.get('connection_type', '')
|
| 511 |
+
if conn_type:
|
| 512 |
+
connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
|
| 513 |
+
|
| 514 |
+
log_message("="*60)
|
| 515 |
+
log_message("CONNECTION TYPES FOUND IN TABLES:")
|
| 516 |
+
for conn_type, count in sorted(connection_types.items()):
|
| 517 |
+
log_message(f" {conn_type}: {count} chunks")
|
| 518 |
+
log_message("="*60)
|
| 519 |
+
|
| 520 |
# Load images (no chunking needed)
|
| 521 |
image_docs = load_image_documents(repo_id, hf_token, image_dir)
|
| 522 |
|
index_retriever.py
CHANGED
|
@@ -10,6 +10,30 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
|
|
| 10 |
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
return VectorStoreIndex.from_documents(documents)
|
| 14 |
|
| 15 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
|
@@ -44,66 +68,20 @@ def create_query_engine(vector_index):
|
|
| 44 |
try:
|
| 45 |
from config import CUSTOM_PROMPT
|
| 46 |
|
| 47 |
-
# Preprocess query to expand table number patterns
|
| 48 |
-
class TableAwareRetriever:
|
| 49 |
-
def __init__(self, base_retriever):
|
| 50 |
-
self.base_retriever = base_retriever
|
| 51 |
-
|
| 52 |
-
def retrieve(self, query_str):
|
| 53 |
-
import re
|
| 54 |
-
|
| 55 |
-
# Expand queries with table numbers
|
| 56 |
-
queries = [query_str]
|
| 57 |
-
|
| 58 |
-
# Extract table numbers like С-25, C-25, С25
|
| 59 |
-
table_patterns = re.findall(r'[СCс]-?\s*\d+', query_str)
|
| 60 |
-
if table_patterns:
|
| 61 |
-
for pattern in table_patterns:
|
| 62 |
-
# Normalize: "С-25" -> ["С-25", "C-25", "С25", "C25"]
|
| 63 |
-
normalized = pattern.upper().replace(' ', '')
|
| 64 |
-
variants = [
|
| 65 |
-
normalized,
|
| 66 |
-
normalized.replace('С', 'C'),
|
| 67 |
-
normalized.replace('-', ''),
|
| 68 |
-
normalized.replace('С', 'C').replace('-', '')
|
| 69 |
-
]
|
| 70 |
-
for variant in variants:
|
| 71 |
-
queries.append(f"тип соединения {variant}")
|
| 72 |
-
queries.append(f"таблица {variant}")
|
| 73 |
-
|
| 74 |
-
log_message(f"Searching with {len(queries)} query variants: {queries[:3]}...")
|
| 75 |
-
|
| 76 |
-
# Retrieve with all variants
|
| 77 |
-
all_nodes = []
|
| 78 |
-
seen_ids = set()
|
| 79 |
-
|
| 80 |
-
for q in queries:
|
| 81 |
-
nodes = self.base_retriever.retrieve(q)
|
| 82 |
-
for node in nodes:
|
| 83 |
-
node_id = id(node)
|
| 84 |
-
if node_id not in seen_ids:
|
| 85 |
-
seen_ids.add(node_id)
|
| 86 |
-
all_nodes.append(node)
|
| 87 |
-
|
| 88 |
-
return all_nodes
|
| 89 |
-
|
| 90 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 91 |
docstore=vector_index.docstore,
|
| 92 |
-
similarity_top_k=
|
| 93 |
)
|
| 94 |
|
| 95 |
vector_retriever = VectorIndexRetriever(
|
| 96 |
index=vector_index,
|
| 97 |
-
similarity_top_k=
|
| 98 |
-
similarity_cutoff=0.
|
| 99 |
)
|
| 100 |
|
| 101 |
-
# Wrap retrievers with table-aware logic
|
| 102 |
-
table_aware_bm25 = TableAwareRetriever(bm25_retriever)
|
| 103 |
-
|
| 104 |
hybrid_retriever = QueryFusionRetriever(
|
| 105 |
-
[vector_retriever,
|
| 106 |
-
similarity_top_k=
|
| 107 |
num_queries=1
|
| 108 |
)
|
| 109 |
|
|
@@ -120,6 +98,7 @@ def create_query_engine(vector_index):
|
|
| 120 |
|
| 121 |
log_message("Query engine успешно создан")
|
| 122 |
return query_engine
|
|
|
|
| 123 |
except Exception as e:
|
| 124 |
log_message(f"Ошибка создания query engine: {str(e)}")
|
| 125 |
raise
|
|
|
|
| 10 |
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
+
|
| 14 |
+
# NEW: Analyze connection types before indexing
|
| 15 |
+
connection_types = {}
|
| 16 |
+
table_count = 0
|
| 17 |
+
for doc in documents:
|
| 18 |
+
if doc.metadata.get('type') == 'table':
|
| 19 |
+
table_count += 1
|
| 20 |
+
conn_type = doc.metadata.get('connection_type', '')
|
| 21 |
+
if conn_type:
|
| 22 |
+
connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
|
| 23 |
+
|
| 24 |
+
log_message("="*60)
|
| 25 |
+
log_message(f"INDEXING {table_count} TABLE CHUNKS")
|
| 26 |
+
log_message("CONNECTION TYPES IN INDEX:")
|
| 27 |
+
for conn_type, count in sorted(connection_types.items()):
|
| 28 |
+
log_message(f" {conn_type}: {count} chunks")
|
| 29 |
+
|
| 30 |
+
# Check for С-25 specifically
|
| 31 |
+
if 'С-25' in connection_types:
|
| 32 |
+
log_message(f"✓ С-25 FOUND: {connection_types['С-25']} chunks")
|
| 33 |
+
else:
|
| 34 |
+
log_message("✗ С-25 NOT FOUND IN INDEX!")
|
| 35 |
+
log_message("="*60)
|
| 36 |
+
|
| 37 |
return VectorStoreIndex.from_documents(documents)
|
| 38 |
|
| 39 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
|
|
|
| 68 |
try:
|
| 69 |
from config import CUSTOM_PROMPT
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 72 |
docstore=vector_index.docstore,
|
| 73 |
+
similarity_top_k=200
|
| 74 |
)
|
| 75 |
|
| 76 |
vector_retriever = VectorIndexRetriever(
|
| 77 |
index=vector_index,
|
| 78 |
+
similarity_top_k=200,
|
| 79 |
+
similarity_cutoff=0.15
|
| 80 |
)
|
| 81 |
|
|
|
|
|
|
|
|
|
|
| 82 |
hybrid_retriever = QueryFusionRetriever(
|
| 83 |
+
[vector_retriever, bm25_retriever],
|
| 84 |
+
similarity_top_k=150,
|
| 85 |
num_queries=1
|
| 86 |
)
|
| 87 |
|
|
|
|
| 98 |
|
| 99 |
log_message("Query engine успешно создан")
|
| 100 |
return query_engine
|
| 101 |
+
|
| 102 |
except Exception as e:
|
| 103 |
log_message(f"Ошибка создания query engine: {str(e)}")
|
| 104 |
raise
|
utils.py
CHANGED
|
@@ -181,24 +181,45 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 181 |
start_time = time.time()
|
| 182 |
retrieved_nodes = query_engine.retriever.retrieve(question)
|
| 183 |
log_message(f"user query: {question}")
|
| 184 |
-
|
| 185 |
-
|
| 186 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 187 |
|
| 188 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 189 |
-
|
| 190 |
-
# DEBUG: Log what was retrieved
|
| 191 |
-
log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
|
| 192 |
-
for i, node in enumerate(unique_retrieved): # All debug
|
| 193 |
-
table_num = node.metadata.get('table_number', 'N/A')
|
| 194 |
-
table_title = node.metadata.get('table_title', 'N/A')
|
| 195 |
-
doc_id = node.metadata.get('document_id', 'N/A')
|
| 196 |
-
log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
|
| 197 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 198 |
|
| 199 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
|
| 201 |
|
|
|
|
| 202 |
# Direct query without formatting
|
| 203 |
response = query_engine.query(question)
|
| 204 |
|
|
|
|
| 181 |
start_time = time.time()
|
| 182 |
retrieved_nodes = query_engine.retriever.retrieve(question)
|
| 183 |
log_message(f"user query: {question}")
|
|
|
|
|
|
|
| 184 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 185 |
|
| 186 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 188 |
|
| 189 |
+
# NEW: Check for connection types in retrieved nodes
|
| 190 |
+
conn_types_retrieved = {}
|
| 191 |
+
for node in unique_retrieved:
|
| 192 |
+
if node.metadata.get('type') == 'table':
|
| 193 |
+
conn_type = node.metadata.get('connection_type', '')
|
| 194 |
+
if conn_type:
|
| 195 |
+
conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1
|
| 196 |
+
|
| 197 |
+
if conn_types_retrieved:
|
| 198 |
+
log_message("CONNECTION TYPES IN RETRIEVED:")
|
| 199 |
+
for ct, cnt in sorted(conn_types_retrieved.items()):
|
| 200 |
+
log_message(f" {ct}: {cnt} chunks")
|
| 201 |
+
|
| 202 |
+
# Check if С-25 was retrieved
|
| 203 |
+
if 'С-25' in question:
|
| 204 |
+
if 'С-25' in conn_types_retrieved:
|
| 205 |
+
log_message(f"✓ С-25 RETRIEVED: {conn_types_retrieved['С-25']} chunks")
|
| 206 |
+
else:
|
| 207 |
+
log_message("✗ С-25 NOT RETRIEVED despite being in query!")
|
| 208 |
+
|
| 209 |
+
# Log sample of retrieved tables
|
| 210 |
+
log_message("SAMPLE OF RETRIEVED TABLES:")
|
| 211 |
+
for i, node in enumerate(unique_retrieved[:10]):
|
| 212 |
+
if node.metadata.get('type') == 'table':
|
| 213 |
+
table_num = node.metadata.get('table_number', 'N/A')
|
| 214 |
+
table_title = node.metadata.get('table_title', 'N/A')
|
| 215 |
+
conn_type = node.metadata.get('connection_type', 'N/A')
|
| 216 |
+
doc_id = node.metadata.get('document_id', 'N/A')
|
| 217 |
+
log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
|
| 218 |
+
|
| 219 |
+
# Rerank
|
| 220 |
reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
|
| 221 |
|
| 222 |
+
|
| 223 |
# Direct query without formatting
|
| 224 |
response = query_engine.query(question)
|
| 225 |
|