| from llama_index.core import VectorStoreIndex, Settings | |
| from llama_index.core.query_engine import RetrieverQueryEngine | |
| from llama_index.core.retrievers import VectorIndexRetriever | |
| from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode | |
| from llama_index.core.prompts import PromptTemplate | |
| from llama_index.retrievers.bm25 import BM25Retriever | |
| from llama_index.core.retrievers import QueryFusionRetriever | |
| from my_logging import log_message | |
| from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK | |
def create_vector_index(documents):
    """Build and return a VectorStoreIndex over *documents*.

    While scanning, collects lightweight diagnostics about table documents
    (count and which tables declare each ``connection_type``) so they can be
    logged for debugging.

    Parameters
    ----------
    documents : iterable of llama_index Document-like objects
        Each must expose a ``metadata`` dict; table documents are identified
        by ``metadata['type'] == 'table'``.

    Returns
    -------
    VectorStoreIndex
        Index built via ``VectorStoreIndex.from_documents``.
    """
    log_message("Строю векторный индекс")
    connection_type_sources = {}
    table_count = 0
    for doc in documents:
        if doc.metadata.get('type') == 'table':
            table_count += 1
            conn_type = doc.metadata.get('connection_type', '')
            if conn_type:
                table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
                connection_type_sources.setdefault(conn_type, []).append(table_id)
    # BUG FIX: these statistics were previously computed and silently discarded
    # (dead code); surface them in the log so the scan has a purpose.
    if table_count:
        log_message(f"Tables found: {table_count}, connection types: {sorted(connection_type_sources)}")
    return VectorStoreIndex.from_documents(documents)
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
    """Rerank retrieved *nodes* against *query* with a cross-encoder reranker.

    Parameters
    ----------
    query : str
        User query text.
    nodes : list | None
        Retrieved nodes; each must expose a ``.text`` attribute when a
        reranker is supplied.
    reranker : object | None
        Model with a ``predict(pairs) -> scores`` method (e.g. a
        sentence-transformers CrossEncoder). If falsy, nodes pass through.
    top_k : int
        Maximum number of nodes to return.
    min_score_threshold : float
        Nodes scoring below this are dropped; if none survive, the best
        ``top_k`` by score are returned instead.

    Returns
    -------
    list
        At most ``top_k`` nodes. On any reranking error the original order
        (truncated to ``top_k``) is returned.
    """
    # BUG FIX: the original combined guard sliced `nodes` even when it was
    # None (`None[:top_k]` -> TypeError). Handle the empty/None case first.
    if not nodes:
        return []
    if not reranker:
        return nodes[:top_k]
    try:
        log_message(f"Переранжирую {len(nodes)} узлов")
        pairs = [[query, node.text] for node in nodes]
        scores = reranker.predict(pairs)
        scored_nodes = sorted(zip(nodes, scores), key=lambda x: x[1], reverse=True)
        filtered = [(node, score) for node, score in scored_nodes if score >= min_score_threshold]
        if not filtered:
            # Nothing cleared the threshold: fall back to the best-scored nodes.
            filtered = scored_nodes[:top_k]
        log_message(f"Выбрано {min(len(filtered), top_k)} узлов")
        return [node for node, _ in filtered[:top_k]]
    except Exception as e:
        # Best-effort: reranking is an enhancement, never a hard failure.
        log_message(f"Ошибка переранжировки: {str(e)}")
        return nodes[:top_k]
def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
                        similarity_cutoff=0.55, hybrid_top_k=100):
    """Create a hybrid (dense vector + BM25) RetrieverQueryEngine.

    Parameters
    ----------
    vector_index : VectorStoreIndex
        Index whose docstore also backs the BM25 retriever.
    vector_top_k : int
        Maximum nodes returned by the dense retriever.
    bm25_top_k : int
        Maximum nodes returned by the sparse (BM25) retriever.
    similarity_cutoff : float
        Similarity cutoff passed to the vector retriever.
    hybrid_top_k : int
        Maximum nodes after fusing both retrievers.

    Returns
    -------
    RetrieverQueryEngine
        Engine that fuses both retrievers and synthesizes answers with the
        TREE_SUMMARIZE mode and the project's custom prompt.

    Raises
    ------
    Exception
        Any construction error is logged and re-raised unchanged.
    """
    try:
        # BUG FIX: removed a redundant function-local
        # `from config import CUSTOM_PROMPT` that shadowed the identical
        # module-level import.
        bm25_retriever = BM25Retriever.from_defaults(
            docstore=vector_index.docstore,
            similarity_top_k=bm25_top_k,
        )
        # NOTE(review): llama_index's VectorIndexRetriever may not accept
        # `similarity_cutoff` directly (cutoff is usually applied via a
        # SimilarityPostprocessor) — confirm against the installed version.
        vector_retriever = VectorIndexRetriever(
            index=vector_index,
            similarity_top_k=vector_top_k,
            similarity_cutoff=similarity_cutoff,
        )
        # Fuse dense and sparse results; num_queries=1 disables LLM-based
        # query generation so only the original query is used.
        hybrid_retriever = QueryFusionRetriever(
            [vector_retriever, bm25_retriever],
            similarity_top_k=hybrid_top_k,
            num_queries=1,
        )
        response_synthesizer = get_response_synthesizer(
            response_mode=ResponseMode.TREE_SUMMARIZE,
            text_qa_template=PromptTemplate(CUSTOM_PROMPT),
        )
        query_engine = RetrieverQueryEngine(
            retriever=hybrid_retriever,
            response_synthesizer=response_synthesizer,
        )
        log_message(f"Query engine created: vector_top_k={vector_top_k}, "
                    f"bm25_top_k={bm25_top_k}, similarity_cutoff={similarity_cutoff}, "
                    f"hybrid_top_k={hybrid_top_k}")
        return query_engine
    except Exception as e:
        log_message(f"Ошибка создания query engine: {str(e)}")
        raise