Spaces:

Danielsz
/

testing

Sleeping

App Files Files Community

testing / interview /vectorize_service.py

Danielsz

deploy

16e8be2 7 days ago

Raw

History Blame Contribute Delete

3.14 kB

	"""
	Qdrant vectorization service for interview phrases.
	Creates/recreates the 'entrevista_itca' collection and upserts all phrase embeddings.
	"""

	import logging

	from qdrant_client import QdrantClient
	from qdrant_client.models import VectorParams, Distance, PointStruct
	from sentence_transformers import SentenceTransformer

	from .phrases_data import setup_data

	# Module logger, registered under a fixed name.
	log = logging.getLogger("interview.vectorize")

	import os

	# Qdrant connection settings; each one can be overridden through an
	# environment variable of the same name.
	QDRANT_HOST = os.environ.get("QDRANT_HOST", "127.0.0.1")
	QDRANT_PORT = int(os.environ.get("QDRANT_PORT", 6333))
	# An empty API key makes vectorize_collection() build the client without one.
	QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")

	# Collection that vectorize_collection() rebuilds.
	COLLECTION_NAME = "entrevista_itca"
	# SentenceTransformer model used to embed the phrase texts.
	MODEL_NAME = "all-MiniLM-L6-v2"


	def vectorize_collection():
	    """
	    Rebuild the Qdrant collection of interview phrases from scratch.

	    Loads the phrases with ``setup_data()``, embeds each phrase's ``text``
	    with the SentenceTransformer model named by ``MODEL_NAME``, then deletes
	    ``COLLECTION_NAME`` if it exists, recreates it with cosine distance, and
	    upserts one point per phrase.

	    The phrases are loaded and embedded *before* the existing collection is
	    dropped, so an empty phrase list or an error while encoding leaves the
	    current collection untouched instead of wiped.

	    Each phrase is expected to be a mapping with ``id``, ``text`` and
	    ``category`` keys; ``sentiment`` is optional and defaults to
	    ``"neutral"``. Point ids are the phrase's position in the loaded list;
	    the phrase's own ``id`` is stored in the payload.

	    Returns:
	        None. When no phrases are loaded, a warning is logged and the
	        function returns without deleting or creating anything.
	    """
	    # Validate the input first: nothing destructive may happen without data.
	    phrases = setup_data()
	    if not phrases:
	        log.warning("No phrases loaded. Aborting vectorization.")
	        return

	    log.info("Connecting to Qdrant at %s:%s...", QDRANT_HOST, QDRANT_PORT)
	    if QDRANT_API_KEY:
	        # NOTE(review): with an API key the host is passed as `url=`, so
	        # QDRANT_HOST is presumably a full URL (e.g. a hosted endpoint) in
	        # that setup -- confirm against the deployment configuration.
	        client = QdrantClient(url=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY)
	    else:
	        client = QdrantClient(QDRANT_HOST, port=QDRANT_PORT)

	    log.info("Loading SentenceTransformer model '%s'...", MODEL_NAME)
	    model = SentenceTransformer(MODEL_NAME)
	    vector_size = model.get_sentence_embedding_dimension()

	    log.info("Generating embeddings and upserting %d phrases...", len(phrases))
	    # Encode every text in a single call so the model can batch the work,
	    # instead of invoking it once per phrase.
	    vectors = model.encode([p["text"] for p in phrases]).tolist()
	    points = [
	        PointStruct(
	            id=idx,
	            vector=vec,
	            payload={
	                "id": p["id"],
	                "text": p["text"],
	                "category": p["category"],
	                "sentiment": p.get("sentiment", "neutral"),
	            },
	        )
	        for idx, (p, vec) in enumerate(zip(phrases, vectors))
	    ]

	    # Everything needed is in memory now; only at this point replace the
	    # existing collection.
	    collections = [c.name for c in client.get_collections().collections]
	    if COLLECTION_NAME in collections:
	        log.info("Deleting existing collection '%s'...", COLLECTION_NAME)
	        client.delete_collection(COLLECTION_NAME)

	    log.info("Creating collection '%s' with dimension %d...", COLLECTION_NAME, vector_size)
	    client.create_collection(
	        collection_name=COLLECTION_NAME,
	        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
	    )

	    # Keyword payload indexes on `category` and `sentiment`. This step is
	    # deliberately best-effort: any failure (including the import) is
	    # logged as a warning and the upsert still proceeds.
	    try:
	        from qdrant_client.models import PayloadSchemaType
	        client.create_payload_index(
	            collection_name=COLLECTION_NAME,
	            field_name="category",
	            field_schema=PayloadSchemaType.KEYWORD,
	        )
	        client.create_payload_index(
	            collection_name=COLLECTION_NAME,
	            field_name="sentiment",
	            field_schema=PayloadSchemaType.KEYWORD,
	        )
	        log.info("Payload keyword indexes created for category and sentiment.")
	    except Exception as e:
	        log.warning("Could not create payload indexes: %s", e)

	    client.upsert(collection_name=COLLECTION_NAME, points=points)
	    log.info("Vectorization complete! %d points upserted.", len(points))