|
|
import os |
|
|
from typing import Any |
|
|
|
|
|
from dotenv import find_dotenv, load_dotenv |
|
|
from huggingface_hub import login |
|
|
from llama_index.core import VectorStoreIndex |
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding |
|
|
from llama_index.vector_stores.milvus import MilvusVectorStore |
|
|
|
|
|
from src.agent_hackathon.consts import PROJECT_ROOT_DIR |
|
|
from src.agent_hackathon.logger import get_logger |
|
|
|
|
|
# Module-level logger shared by everything in this file; its log directory is
# the "logs" folder under the project root.
logger = get_logger(
    log_name="query_vector_db",
    log_dir=PROJECT_ROOT_DIR / "logs",
)
|
|
|
|
|
|
|
|
class RetrieverEngineBuilder:
    """Build a LlamaIndex retriever on top of an existing Milvus vector store.

    The constructor instantiates a ``HuggingFaceEmbedding`` model;
    ``build_retriever_engine`` then wraps the configured vector store in a
    ``VectorStoreIndex`` and returns a retriever for it.

    Note:
        The constructor does not call ``_load_env`` or ``_login_huggingface``.
        Invoke them explicitly beforehand when the embedding model needs an
        authenticated HuggingFace session.
    """

    def __init__(
        self,
        hf_token_env: str = "HF_TOKEN",
        embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
        # Quoted so the optional type is explicit without requiring PEP 604
        # (``X | None``) support at runtime.
        vector_store: "MilvusVectorStore | None" = None,
        device: str = "cpu",
    ) -> None:
        """Initialize the RetrieverEngineBuilder.

        Args:
            hf_token_env: Name of the environment variable holding the
                HuggingFace token.
            embedding_model: Name of the embedding model.
            vector_store: An instance of MilvusVectorStore. May be omitted
                here, but must be set before ``build_retriever_engine`` is
                called.
            device: Device to run the embedding model on.
        """
        self.hf_token_env = hf_token_env
        self.embedding_model = embedding_model
        self.vector_store = vector_store
        self.device = device

        logger.info("Initializing RetrieverEngineBuilder.")
        self.embed_model = HuggingFaceEmbedding(
            model_name=self.embedding_model, device=self.device
        )
        logger.info("RetrieverEngineBuilder initialized.")

    def _login_huggingface(self) -> None:
        """Log in to HuggingFace with the token read from ``self.hf_token_env``.

        When the environment variable is unset or empty the login is skipped
        with a warning: passing ``token=None`` to ``login`` would make
        ``huggingface_hub`` fall back to prompting for a token, which is not
        usable in non-interactive runs.
        """
        token = os.getenv(self.hf_token_env)
        if not token:
            logger.warning(
                f"Environment variable {self.hf_token_env} is not set; "
                "skipping HuggingFace login."
            )
            return

        logger.info("Logging in to HuggingFace.")
        login(token=token)
        logger.info("Logged in to HuggingFace.")

    def _load_env(self) -> None:
        """Load environment variables from a .env file, if one can be found."""
        logger.info("Loading environment variables.")
        # A missing .env file is not an error (raise_error_if_not_found=False).
        load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False))
        logger.info("Environment variables loaded.")

    def build_retriever_engine(
        self,
        similarity_top_k: int = 5,
        vector_store_query_mode: str = "hybrid",
    ) -> Any:
        """Build and return the retriever engine.

        Args:
            similarity_top_k: Number of top results the retriever returns.
                Defaults to 5, the previously hard-coded value.
            vector_store_query_mode: Query mode forwarded to
                ``index.as_retriever``. Defaults to ``"hybrid"``, the
                previously hard-coded value.

        Returns:
            Retriever engine object.

        Raises:
            ValueError: If no vector store was configured, or if
                ``similarity_top_k`` is smaller than 1.
        """
        # Fail early with a clear message instead of an obscure error raised
        # from inside LlamaIndex when the store is missing.
        if self.vector_store is None:
            raise ValueError(
                "vector_store must be provided to build a retriever engine."
            )
        if similarity_top_k < 1:
            raise ValueError("similarity_top_k must be a positive integer.")

        logger.info("Building retriever engine.")
        index = VectorStoreIndex.from_vector_store(
            vector_store=self.vector_store, embed_model=self.embed_model
        )
        retriever = index.as_retriever(
            vector_store_query_mode=vector_store_query_mode,
            similarity_top_k=similarity_top_k,
        )
        logger.info("Retriever engine built.")
        return retriever
|
|
|