|
|
|
|
|
|
|
|
import os
|
|
|
import hashlib
|
|
|
from qdrant_client import QdrantClient
|
|
|
from qdrant_client.http.models import PointStruct, VectorParams, Distance
|
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
|
# Runtime configuration, read once from the environment at import time.
# Only the collection name has a fallback; the URL and API key are None
# when the corresponding variable is unset.
COLLECTION_NAME = os.environ.get("QDRANT_COLLECTION", "document_store")
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
|
|
|
|
|
|
|
|
|
# Module-wide Qdrant connection shared by every helper below.
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Embedding model used to vectorise payload text; loaded once at import time.
model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
|
|
|
|
|
|
def init_collection():
    """Create the Qdrant collection if it does not exist yet.

    The collection is configured for 384-dimensional vectors compared with
    cosine distance. An existing collection is left untouched.

    ``create_collection`` is used instead of the deprecated
    ``recreate_collection``: the latter drops and recreates the collection,
    so if another process created (and filled) it between the existence
    check and the call, its data would be silently wiped.
    """
    existing_names = {col.name for col in client.get_collections().collections}
    if COLLECTION_NAME in existing_names:
        return

    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )
|
|
|
|
|
|
|
|
|
# Runs at import time so the collection is created (if missing) before any
# helper in this module is called.
init_collection()
|
|
|
|
|
|
|
|
|
def compute_id(filename):
    """Return a deterministic integer point ID derived from *filename*.

    The ID is the first 8 bytes of the MD5 digest of the encoded filename,
    read as a big-endian unsigned integer (0 <= id < 2**64). MD5 serves
    only as a stable hash here, not for security.
    """
    digest = hashlib.md5(filename.encode()).digest()
    # First 8 bytes == first 16 hex characters of the hexdigest.
    return int.from_bytes(digest[:8], "big")
|
|
|
|
|
|
|
|
|
def get_entry(filename):
    """Return the payload stored for *filename*, or ``None`` if there is none.

    The point is looked up by the ID that ``compute_id`` derives from the
    filename.
    """
    matches = client.retrieve(
        collection_name=COLLECTION_NAME,
        ids=[compute_id(filename)],
    )
    if not matches:
        return None
    return matches[0].payload
|
|
|
|
|
|
|
|
|
def upsert_entry(filename, **fields):
    """Create or update the point stored for *filename*.

    The payload currently stored for the file (if any) is merged with
    ``fields``. Fields whose value is ``None`` are skipped, so they never
    overwrite a stored value. The merged payload's ``"text"`` entry (empty
    string when absent, ``str()``-converted when not a string) is embedded
    and stored as the point's vector.

    Args:
        filename: Key of the entry; the point ID is ``compute_id(filename)``
            and the value is always written to the payload's ``"filename"``.
        **fields: Payload fields to set or replace.
    """
    init_collection()

    stored = get_entry(filename) or {}

    # The `filename` argument is authoritative: a "filename" key coming from
    # the stored payload (or, defensively, from `fields`) must never override
    # it. Seeding the dict first keeps "filename" as the leading key.
    payload = {"filename": filename}
    payload.update((k, v) for k, v in stored.items() if k != "filename")
    payload.update(
        (k, v) for k, v in fields.items() if k != "filename" and v is not None
    )

    base_text = payload.get("text", "")
    if not isinstance(base_text, str):
        base_text = str(base_text)

    try:
        vector = model.encode(base_text, normalize_embeddings=True).tolist()
    except Exception as e:
        # Deliberate best-effort: keep the payload write even when embedding
        # fails, using a zero placeholder of the collection's vector size.
        print(f"β Vector encoding failed for {filename}: {e}")
        vector = [0.0] * 384

    client.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            PointStruct(id=compute_id(filename), vector=vector, payload=payload)
        ],
    )
|
|
|
|
|
|
|