import os
from typing import Any, Optional

from dotenv import find_dotenv, load_dotenv
from huggingface_hub import login
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore

from src.agent_hackathon.consts import PROJECT_ROOT_DIR
from src.agent_hackathon.logger import get_logger

logger = get_logger(log_name="query_vector_db", log_dir=PROJECT_ROOT_DIR / "logs")


class RetrieverEngineBuilder:
    """
    Build a LlamaIndex retriever on top of an existing Milvus vector store.

    The builder owns a HuggingFace embedding model (created eagerly in
    ``__init__``) and wraps the supplied vector store in a
    ``VectorStoreIndex`` when ``build_retriever_engine`` is called.
    """

    def __init__(
        self,
        hf_token_env: str = "HF_TOKEN",
        embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
        vector_store: Optional[MilvusVectorStore] = None,
        device: str = "cpu",
    ) -> None:
        """
        Initialize the RetrieverEngineBuilder.

        Args:
            hf_token_env: Name of the environment variable holding the
                HuggingFace token (read only by ``_login_huggingface``).
            embedding_model: Name of the HuggingFace embedding model.
            vector_store: An instance of MilvusVectorStore. May be left as
                ``None`` at construction time, but must be set before
                ``build_retriever_engine`` is called.
            device: Device to run the embedding model on.
        """
        self.hf_token_env = hf_token_env
        self.embedding_model = embedding_model
        self.vector_store = vector_store
        self.device = device

        logger.info("Initializing RetrieverEngineBuilder.")
        # NOTE: loading the .env file and logging in to HuggingFace are
        # currently disabled here. If they are needed, call ``_load_env()``
        # first and ``_login_huggingface()`` second, so that a token defined
        # in the .env file is visible to the login step.
        self.embed_model = HuggingFaceEmbedding(
            model_name=self.embedding_model, device=self.device
        )
        logger.info("RetrieverEngineBuilder initialized.")

    def _login_huggingface(self) -> None:
        """
        Log in to HuggingFace using the token from the configured env variable.

        If the environment variable is unset or empty, the login is skipped
        with a warning instead of calling ``login(token=None)``, which would
        make ``huggingface_hub`` fall back to an interactive prompt and can
        block a non-interactive process.
        """
        token = os.getenv(self.hf_token_env)
        if not token:
            logger.warning(
                f"Environment variable '{self.hf_token_env}' is not set; "
                "skipping HuggingFace login."
            )
            return

        logger.info("Logging in to HuggingFace.")
        login(token=token)
        logger.info("Logged in to HuggingFace.")

    def _load_env(self) -> None:
        """
        Load environment variables from a .env file, if one can be found.

        A missing .env file is not an error: ``find_dotenv`` is asked not to
        raise, and the boolean result of ``load_dotenv`` is ignored.
        """
        logger.info("Loading environment variables.")
        load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False))
        logger.info("Environment variables loaded.")

    def build_retriever_engine(
        self,
        *,
        similarity_top_k: int = 5,
        vector_store_query_mode: str = "hybrid",
    ) -> Any:
        """
        Build and return the retriever engine.

        Args:
            similarity_top_k: Number of top results the retriever returns.
            vector_store_query_mode: Query mode forwarded to
                ``index.as_retriever``. Defaults to ``"hybrid"``.
                NOTE(review): hybrid mode presumably requires the Milvus
                store to have been created with sparse/hybrid support -
                confirm against the store's configuration.

        Returns:
            Retriever object produced by ``VectorStoreIndex.as_retriever``.

        Raises:
            ValueError: If no vector store was provided to the builder.
        """
        # Fail early with a clear message rather than letting the library
        # trip over a ``None`` vector store deep inside index construction.
        if self.vector_store is None:
            raise ValueError(
                "A vector_store must be provided to RetrieverEngineBuilder "
                "before building the retriever engine."
            )

        logger.info("Building retriever engine.")
        index = VectorStoreIndex.from_vector_store(
            vector_store=self.vector_store, embed_model=self.embed_model
        )
        retriever = index.as_retriever(
            vector_store_query_mode=vector_store_query_mode,
            similarity_top_k=similarity_top_k,
        )
        logger.info("Retriever engine built.")
        return retriever