testing / interview /vectorize_service.py
Danielsz's picture
deploy
16e8be2
Raw
History Blame Contribute Delete
3.14 kB
"""
Qdrant vectorization service for interview phrases.
Creates/recreates the 'entrevista_itca' collection and upserts all phrase embeddings.
"""
import logging
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
from .phrases_data import setup_data
import os

# Module-level logger used by the vectorization routine.
log = logging.getLogger("interview.vectorize")

# Qdrant connection settings, each overridable through the environment.
QDRANT_HOST = os.environ.get("QDRANT_HOST", "127.0.0.1")
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", 6333))
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")

# Name of the target collection and of the embedding model.
COLLECTION_NAME = "entrevista_itca"
MODEL_NAME = "all-MiniLM-L6-v2"
def vectorize_collection():
    """
    Load phrases, generate embeddings, and upsert them into Qdrant.

    The phrases are fetched first. When none are available the function logs
    a warning and returns before the collection is modified, so an existing
    collection is left untouched. Otherwise the collection named by
    ``COLLECTION_NAME`` is deleted (if present), created again with cosine
    distance and the model's embedding dimension, given keyword payload
    indexes on ``category`` and ``sentiment`` (best effort), and filled with
    one point per phrase.

    Each phrase is read as a mapping with ``id``, ``text`` and ``category``
    keys; ``sentiment`` is optional and defaults to ``"neutral"``.

    Returns:
        None.
    """
    # Load the phrases before touching Qdrant: the steps below drop the
    # existing collection, so bailing out afterwards would leave it empty.
    phrases = setup_data()
    if not phrases:
        log.warning("No phrases loaded. Aborting vectorization.")
        return

    log.info("Connecting to Qdrant at %s:%s...", QDRANT_HOST, QDRANT_PORT)
    # NOTE(review): the authenticated branch passes the host as ``url`` while
    # the anonymous branch passes it positionally -- presumably so a full
    # URL can be supplied for a hosted instance; confirm against deployment.
    if QDRANT_API_KEY:
        client = QdrantClient(url=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY)
    else:
        client = QdrantClient(QDRANT_HOST, port=QDRANT_PORT)

    log.info("Loading SentenceTransformer model '%s'...", MODEL_NAME)
    model = SentenceTransformer(MODEL_NAME)
    vector_size = model.get_sentence_embedding_dimension()

    # Recreate the collection from scratch.
    collections = [c.name for c in client.get_collections().collections]
    if COLLECTION_NAME in collections:
        log.info("Deleting existing collection '%s'...", COLLECTION_NAME)
        client.delete_collection(COLLECTION_NAME)
    log.info("Creating collection '%s' with dimension %d...", COLLECTION_NAME, vector_size)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    # Ensure payload keyword indexes exist for category/sentiment. This is
    # deliberately best effort: a failure is logged and vectorization goes on.
    try:
        from qdrant_client.models import PayloadSchemaType

        for field_name in ("category", "sentiment"):
            client.create_payload_index(
                collection_name=COLLECTION_NAME,
                field_name=field_name,
                field_schema=PayloadSchemaType.KEYWORD,
            )
        log.info("Payload keyword indexes created for category and sentiment.")
    except Exception as e:
        log.warning("Could not create payload indexes: %s", e)

    log.info("Generating embeddings and upserting %d phrases...", len(phrases))
    # Encode every text in a single call instead of once per phrase so the
    # model can batch the work internally.
    vectors = model.encode([p["text"] for p in phrases])
    points = [
        PointStruct(
            id=idx,  # point id is the phrase's position in the loaded list
            vector=vec.tolist(),
            payload={
                "id": p["id"],
                "text": p["text"],
                "category": p["category"],
                "sentiment": p.get("sentiment", "neutral"),
            },
        )
        for idx, (p, vec) in enumerate(zip(phrases, vectors))
    ]
    client.upsert(collection_name=COLLECTION_NAME, points=points)
    log.info("Vectorization complete! %d points upserted.", len(points))