"""
Qdrant vectorization service for interview phrases.

Creates/recreates the 'entrevista_itca' collection and upserts all phrase
embeddings.
"""
import logging
import os

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer

from .phrases_data import setup_data

log = logging.getLogger("interview.vectorize")

# Connection settings are read from the environment once, at import time.
QDRANT_HOST = os.getenv("QDRANT_HOST", "127.0.0.1")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", 6333))
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "")

COLLECTION_NAME = "entrevista_itca"
MODEL_NAME = "all-MiniLM-L6-v2"


def vectorize_collection():
    """
    Load phrases, generate embeddings, and upsert them into Qdrant.

    The collection named by ``COLLECTION_NAME`` is recreated from scratch.
    All fallible preparation (loading phrases, loading the model, encoding)
    happens *before* the existing collection is deleted, so an empty phrase
    list or an encoding failure leaves the current collection untouched.

    Returns:
        None. If ``setup_data()`` yields no phrases, a warning is logged and
        the function returns without contacting Qdrant.
    """
    # Check the input first: with nothing to index there is no reason to
    # drop the collection that is currently being served.
    phrases = setup_data()
    if not phrases:
        log.warning("No phrases loaded. Aborting vectorization.")
        return

    log.info("Connecting to Qdrant at %s:%s...", QDRANT_HOST, QDRANT_PORT)
    if QDRANT_API_KEY:
        client = QdrantClient(
            url=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY
        )
    else:
        client = QdrantClient(QDRANT_HOST, port=QDRANT_PORT)

    log.info("Loading SentenceTransformer model '%s'...", MODEL_NAME)
    model = SentenceTransformer(MODEL_NAME)
    vector_size = model.get_sentence_embedding_dimension()

    log.info("Generating embeddings and upserting %d phrases...", len(phrases))
    # Encode every text in a single batched call instead of one model
    # invocation per phrase.
    vectors = model.encode([p["text"] for p in phrases]).tolist()
    points = [
        PointStruct(
            id=idx,
            vector=vec,
            payload={
                "id": p["id"],
                "text": p["text"],
                "category": p["category"],
                "sentiment": p.get("sentiment", "neutral"),
            },
        )
        for idx, (p, vec) in enumerate(zip(phrases, vectors))
    ]

    # Recreate the collection only once the replacement data is ready.
    collections = [c.name for c in client.get_collections().collections]
    if COLLECTION_NAME in collections:
        log.info("Deleting existing collection '%s'...", COLLECTION_NAME)
        client.delete_collection(COLLECTION_NAME)

    log.info(
        "Creating collection '%s' with dimension %d...",
        COLLECTION_NAME,
        vector_size,
    )
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    # Keyword payload indexes on category/sentiment. This is deliberately
    # best-effort: a failure here is logged and vectorization continues.
    try:
        from qdrant_client.models import PayloadSchemaType

        client.create_payload_index(
            collection_name=COLLECTION_NAME,
            field_name="category",
            field_schema=PayloadSchemaType.KEYWORD,
        )
        client.create_payload_index(
            collection_name=COLLECTION_NAME,
            field_name="sentiment",
            field_schema=PayloadSchemaType.KEYWORD,
        )
        log.info("Payload keyword indexes created for category and sentiment.")
    except Exception as e:
        log.warning("Could not create payload indexes: %s", e)

    client.upsert(collection_name=COLLECTION_NAME, points=points)
    log.info("Vectorization complete! %d points upserted.", len(points))