"""Embedding generation via HuggingFace Inference API (no local torch needed).""" import os import logging logger = logging.getLogger(__name__) MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" EMBEDDING_DIM = 384 BATCH_SIZE = 96 # HF Inference API batch limit def _get_client(): """Get HuggingFace InferenceClient.""" from huggingface_hub import InferenceClient token = os.environ.get("HF_TOKEN") return InferenceClient(token=token) def generate(texts: list[str]) -> list[list[float]]: """Encode texts into embedding vectors via HF Inference API.""" client = _get_client() all_embeddings = [] # Process in batches for i in range(0, len(texts), BATCH_SIZE): batch = texts[i : i + BATCH_SIZE] result = client.feature_extraction(batch, model=MODEL_NAME) all_embeddings.extend(result) logger.info("Generated %d embeddings via HF Inference API", len(all_embeddings)) return [[float(x) for x in emb] for emb in all_embeddings] def generate_query(query: str) -> list[float]: """Embed a single query string (for future RAG Engine use).""" return generate([query])[0]