Spaces:
Build error
Build error
| import config | |
| import utils | |
| from nltk.tokenize import word_tokenize | |
| from typing import List | |
| import nltk | |
| import torch | |
| import pickle | |
| from langchain.docstore.document import Document as LangchainDocument | |
| from rank_bm25 import BM25Okapi | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores.utils import DistanceStrategy | |
| from langchain.retrievers import EnsembleRetriever | |
| from langchain_community.retrievers import BM25Retriever | |
| import os | |
def create_vector_db(docs: List[LangchainDocument]):
    """Load the persisted FAISS vector store, or build and persist a new one.

    If ``config.DB_PATH`` exists, the store is loaded from there and ``docs``
    is ignored. Otherwise a new store is built from ``docs`` with the
    embedding model named by ``config.EMBEDDING_MODEL_NAME`` and saved to
    ``config.DB_PATH``.

    Args:
        docs: Documents to index. May be ``None`` (or empty) when a saved
            store already exists on disk.

    Returns:
        The loaded or newly created FAISS vector store.

    Raises:
        ValueError: If no saved store exists and ``docs`` is ``None`` or empty.
    """
    db_path: str = config.DB_PATH
    db_exists = os.path.exists(db_path)

    # Fail fast: with neither a saved store nor documents there is nothing to
    # do, so avoid loading the embedding model just to raise afterwards.
    # An empty list is rejected too; FAISS cannot build an index from it.
    if not db_exists and not docs:
        raise ValueError(
            "Documents are missing!\n"
            "Please load the documents and set get_data=True in app.py."
        )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedding_model = HuggingFaceEmbeddings(
        model_name=config.EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )

    if db_exists:
        print(f"Завантаження векторної бази даних з {db_path}")
        # NOTE(security): the saved store contains pickled data, and loading
        # it can execute arbitrary code. Only load files this app produced.
        return FAISS.load_local(
            db_path,
            embedding_model,
            allow_dangerous_deserialization=True,
            # Keep the loaded store configured like a freshly built one.
            distance_strategy=DistanceStrategy.COSINE,
        )

    print("Створення нової векторної бази даних")
    knowledge_vector_database = FAISS.from_documents(
        docs, embedding_model, distance_strategy=DistanceStrategy.COSINE
    )
    knowledge_vector_database.save_local(db_path)
    print(f"Векторна база даних збережена в {db_path}")
    return knowledge_vector_database
def create_bm25(docs: List[LangchainDocument]):
    """Load the pickled BM25 index, or build and persist a new one.

    If ``config.BM25_PATH`` exists, the index is unpickled from there and
    ``docs`` is ignored. Otherwise each document's ``page_content`` is
    lower-cased and tokenised with ``word_tokenize``, a ``BM25Okapi`` index is
    built over the tokens, and the index is pickled to ``config.BM25_PATH``.

    Args:
        docs: Documents to index. May be ``None`` (or empty) when a saved
            index already exists on disk.

    Returns:
        The loaded or newly created ``BM25Okapi`` index.

    Raises:
        ValueError: If no saved index exists and ``docs`` is ``None`` or empty.
    """
    bm25_path: str = config.BM25_PATH

    if os.path.exists(bm25_path):
        print(f"Завантаження BM25 індексу з {bm25_path}")
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # index files that this application wrote itself.
        with open(bm25_path, "rb") as file:
            return pickle.load(file)

    # An empty corpus is rejected as well: BM25Okapi cannot be initialised
    # from zero documents and would fail with an unrelated low-level error.
    if not docs:
        raise ValueError(
            "Documents are missing!\n"
            "Please load the documents and set get_data=True in app.py."
        )

    print("Створення нового BM25 індексу")
    tokenized_docs = [word_tokenize(doc.page_content.lower()) for doc in docs]
    bm25 = BM25Okapi(tokenized_docs)

    # Write to a temporary file and rename it into place, so an interrupted
    # dump never leaves a truncated pickle that breaks every later start.
    tmp_path = f"{bm25_path}.tmp"
    with open(tmp_path, "wb") as file:
        pickle.dump(bm25, file)
    os.replace(tmp_path, bm25_path)

    print(f"BM25 індекс збережено в {bm25_path}")
    return bm25
def _lowercase_word_tokenize(text):
    """Tokenise ``text`` the same way ``create_bm25`` tokenises documents."""
    return word_tokenize(text.lower())


def _build_bm25_retriever(docs_processed, bm_25, top_k):
    """Return a BM25 retriever over ``docs_processed`` yielding ``top_k`` hits.

    Reuses the prebuilt ``bm_25`` index when its corpus size matches the
    documents, instead of re-tokenising and re-indexing the corpus on every
    query. Falls back to building a fresh retriever otherwise.
    """
    docs_list = list(docs_processed)
    if bm_25 is not None and getattr(bm_25, "corpus_size", None) == len(docs_list):
        return BM25Retriever(
            vectorizer=bm_25,
            docs=docs_list,
            k=top_k,
            # Queries must be tokenised like the indexed documents were.
            preprocess_func=_lowercase_word_tokenize,
        )
    retriever = BM25Retriever.from_documents(docs_list)
    retriever.k = top_k
    return retriever


def search(docs_processed, bm_25: BM25Okapi, vector_db: FAISS, query, top_k, use_bm25=True, use_semantic_search=True):
    """Retrieve documents for ``query`` with BM25, semantic search, or both.

    Args:
        docs_processed: The documents the BM25 index was built from.
        bm_25: Prebuilt BM25 index over ``docs_processed``.
        vector_db: FAISS vector store used for semantic search.
        query: The search query string.
        top_k: Number of results requested from each retriever.
        use_bm25: Enable keyword (BM25) retrieval.
        use_semantic_search: Enable embedding-based retrieval.

    Returns:
        - Both flags set: the merged result of an equally weighted
          ``EnsembleRetriever`` (each retriever is asked for ``top_k`` hits,
          so the merged list may be longer than ``top_k``).
        - BM25 only: the ``page_content`` strings of the top ``top_k``
          documents (plain strings, not document objects).
        - Semantic only: the result of ``vector_db.similarity_search``.
        - Neither flag set: an empty list.
    """
    if use_bm25 and use_semantic_search:
        bm25_retriever = _build_bm25_retriever(docs_processed, bm_25, top_k)
        faiss_retriever = vector_db.as_retriever(search_kwargs={"k": top_k})
        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, faiss_retriever],
            weights=[0.5, 0.5],
        )
        result = ensemble_retriever.invoke(query)
    elif use_bm25:
        tokenized_query = word_tokenize(query.lower())
        # NOTE(review): this mode returns strings while the other modes return
        # document objects; kept as-is because callers may rely on it.
        result = bm_25.get_top_n(
            tokenized_query,
            [doc.page_content for doc in docs_processed],
            n=top_k,
        )
    elif use_semantic_search:
        result = vector_db.similarity_search(query, k=top_k)
    else:
        result = []
    return result