| """ |
| Qdrant vectorization service for interview phrases. |
| Creates/recreates the 'entrevista_itca' collection and upserts all phrase embeddings. |
| """ |
|
|
| import logging |
|
|
| from qdrant_client import QdrantClient |
| from qdrant_client.models import VectorParams, Distance, PointStruct |
| from sentence_transformers import SentenceTransformer |
|
|
| from .phrases_data import setup_data |
|
|
import os

# Module-level logger for the vectorization service.
log = logging.getLogger("interview.vectorize")

# Qdrant connection settings, overridable through environment variables.
QDRANT_HOST = os.environ.get("QDRANT_HOST", "127.0.0.1")
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")  # defaults to an empty string

# Target collection and the SentenceTransformer model used for embeddings.
COLLECTION_NAME = "entrevista_itca"
MODEL_NAME = "all-MiniLM-L6-v2"
|
|
|
|
def vectorize_collection():
    """
    Load phrases, generate embeddings, and upsert them into Qdrant.

    The collection named by ``COLLECTION_NAME`` is recreated from scratch:
    an existing collection is deleted, a new one is created with cosine
    distance and the model's embedding dimension, keyword payload indexes
    are requested for ``category`` and ``sentiment`` (best effort), and one
    point per phrase is upserted.

    The phrases are loaded and every point (vector + payload) is built
    *before* the existing collection is touched, so an empty phrase source
    or a malformed phrase aborts without wiping the data already stored.

    Each phrase is read as a mapping with the keys ``id``, ``text`` and
    ``category``; ``sentiment`` is optional and defaults to ``"neutral"``.

    Returns:
        None. When ``setup_data()`` yields nothing, a warning is logged and
        the function returns without contacting Qdrant.

    Raises:
        KeyError: if a phrase lacks ``id``, ``text`` or ``category``
            (assuming phrases are dicts, as the access pattern suggests).
    """
    # Load the source data first: without phrases there is nothing to index,
    # and the existing collection must not be destroyed for nothing.
    phrases = setup_data()
    if not phrases:
        log.warning("No phrases loaded. Aborting vectorization.")
        return

    log.info("Connecting to Qdrant at %s:%s...", QDRANT_HOST, QDRANT_PORT)
    if QDRANT_API_KEY:
        client = QdrantClient(url=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY)
    else:
        client = QdrantClient(QDRANT_HOST, port=QDRANT_PORT)

    log.info("Loading SentenceTransformer model '%s'...", MODEL_NAME)
    model = SentenceTransformer(MODEL_NAME)
    vector_size = model.get_sentence_embedding_dimension()

    log.info("Generating embeddings and upserting %d phrases...", len(phrases))
    # Encode all texts in one batched call instead of one call per phrase.
    vectors = model.encode([p["text"] for p in phrases]).tolist()
    points = [
        PointStruct(
            id=idx,
            vector=vec,
            payload={
                "id": p["id"],
                "text": p["text"],
                "category": p["category"],
                "sentiment": p.get("sentiment", "neutral"),
            },
        )
        for idx, (p, vec) in enumerate(zip(phrases, vectors))
    ]

    # Only now, with every point ready, drop and recreate the collection.
    collections = [c.name for c in client.get_collections().collections]
    if COLLECTION_NAME in collections:
        log.info("Deleting existing collection '%s'...", COLLECTION_NAME)
        client.delete_collection(COLLECTION_NAME)

    log.info("Creating collection '%s' with dimension %d...", COLLECTION_NAME, vector_size)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    # Payload indexes are a best-effort optimisation: a failure here is
    # logged and does not prevent the upsert below.
    try:
        from qdrant_client.models import PayloadSchemaType

        for field_name in ("category", "sentiment"):
            client.create_payload_index(
                collection_name=COLLECTION_NAME,
                field_name=field_name,
                field_schema=PayloadSchemaType.KEYWORD,
            )
        log.info("Payload keyword indexes created for category and sentiment.")
    except Exception as e:
        log.warning("Could not create payload indexes: %s", e)

    client.upsert(collection_name=COLLECTION_NAME, points=points)
    log.info("Vectorization complete! %d points upserted.", len(points))
|
|