Spaces:
Sleeping
Sleeping
Commit
·
7062aff
1
Parent(s):
46dedf9
top k 200, 50 + max chunk size = 10 000, max chunk row = 40
Browse files
- index_retriever.py +35 -1
- utils.py +0 -22
index_retriever.py
CHANGED
|
@@ -51,7 +51,7 @@ def create_query_engine(vector_index):
|
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
-
similarity_top_k=
|
| 55 |
similarity_cutoff=0.35
|
| 56 |
)
|
| 57 |
|
|
@@ -73,7 +73,41 @@ def create_query_engine(vector_index):
|
|
| 73 |
)
|
| 74 |
|
| 75 |
log_message("Query engine успешно создан")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
return query_engine
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
except Exception as e:
|
| 79 |
log_message(f"Ошибка создания query engine: {str(e)}")
|
|
|
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
+
similarity_top_k=50,
|
| 55 |
similarity_cutoff=0.35
|
| 56 |
)
|
| 57 |
|
|
|
|
| 73 |
)
|
| 74 |
|
| 75 |
log_message("Query engine успешно создан")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
all_nodes = list(vector_index.docstore.docs.values())
|
| 79 |
+
c25_tables = []
|
| 80 |
+
|
| 81 |
+
for node_id, node in vector_index.docstore.docs.items():
|
| 82 |
+
metadata = node.metadata
|
| 83 |
+
text = node.get_content()
|
| 84 |
+
|
| 85 |
+
# Check if this is a С-25 table
|
| 86 |
+
if ('С-25' in text or 'C-25' in text or
|
| 87 |
+
'С-25' in str(metadata.get('table_title', '')) or
|
| 88 |
+
'С-25' in str(metadata.get('table_number', ''))):
|
| 89 |
+
|
| 90 |
+
c25_tables.append({
|
| 91 |
+
'node_id': node_id,
|
| 92 |
+
'doc_id': metadata.get('document_id'),
|
| 93 |
+
'table_num': metadata.get('table_number'),
|
| 94 |
+
'table_title': metadata.get('table_title', ''),
|
| 95 |
+
'text_preview': text[:200]
|
| 96 |
+
})
|
| 97 |
+
|
| 98 |
+
log_message(f"\n{'='*70}")
|
| 99 |
+
log_message(f"DEBUG: Found {len(c25_tables)} С-25 tables in index:")
|
| 100 |
+
for t in c25_tables:
|
| 101 |
+
log_message(f" • {t['doc_id']} - Table {t['table_num']}")
|
| 102 |
+
log_message(f" Title: {t['table_title']}")
|
| 103 |
+
log_message(f" Preview: {t['text_preview']}")
|
| 104 |
+
log_message(f"{'='*70}\n")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
return query_engine
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
|
| 111 |
|
| 112 |
except Exception as e:
|
| 113 |
log_message(f"Ошибка создания query engine: {str(e)}")
|
utils.py
CHANGED
|
@@ -172,28 +172,6 @@ def deduplicate_nodes(nodes):
|
|
| 172 |
|
| 173 |
return unique_nodes
|
| 174 |
|
| 175 |
-
def debug_search_tables(vector_index, search_term="С-25"):
|
| 176 |
-
"""Debug function to find all tables containing a specific term"""
|
| 177 |
-
all_nodes = list(vector_index.docstore.docs.values())
|
| 178 |
-
|
| 179 |
-
matching = []
|
| 180 |
-
for node in all_nodes:
|
| 181 |
-
if node.metadata.get('type') == 'table':
|
| 182 |
-
text = node.get_content()
|
| 183 |
-
if search_term in text or search_term in node.metadata.get('table_title', ''):
|
| 184 |
-
matching.append({
|
| 185 |
-
'doc_id': node.metadata.get('document_id'),
|
| 186 |
-
'table_num': node.metadata.get('table_number'),
|
| 187 |
-
'title': node.metadata.get('table_title', '')[:100]
|
| 188 |
-
})
|
| 189 |
-
|
| 190 |
-
log_message(f"\n{'='*60}")
|
| 191 |
-
log_message(f"DEBUG: Found {len(matching)} tables containing '{search_term}'")
|
| 192 |
-
for m in matching:
|
| 193 |
-
log_message(f" • {m['doc_id']} - Table {m['table_num']}: {m['title']}")
|
| 194 |
-
log_message(f"{'='*60}\n")
|
| 195 |
-
|
| 196 |
-
return matching
|
| 197 |
|
| 198 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 199 |
if query_engine is None:
|
|
|
|
| 172 |
|
| 173 |
return unique_nodes
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 177 |
if query_engine is None:
|