# dashVectorSpace/scripts/ingest_ms_marco.py
# Author: justmotes — "Deploy dashVectorspace v1 (Full)" (commit b92d96d)
import sys
import os
import numpy as np
from tqdm import tqdm
# Add project root to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import (
NUM_CLUSTERS, FRESHNESS_SHARD_ID, MRL_DIMS,
EMBEDDING_MODELS, ROUTER_MODELS, COLLECTION_NAME,
QDRANT_URL, QDRANT_API_KEY
)
from src.data_pipeline import get_embeddings, load_ms_marco
from src.router import LearnedRouter
from src.vector_db import UnifiedQdrant
def ingest_data(n_samples=1000, model_key="minilm"):
    """Run the full ingestion pipeline against Qdrant.

    Steps: load MS MARCO samples, embed them, train the cluster router,
    then index vectors + payloads into a custom-sharded Qdrant collection.

    Args:
        n_samples: Number of MS MARCO samples to ingest. Defaults to 1000
            for a fast demo pass; raise it (e.g. 20k+) for a
            production-scale run.
        model_key: Key into EMBEDDING_MODELS selecting the encoder.
            'minilm' is the fast CPU-friendly baseline; 'nomic' has more
            dims and is the better fit when real MRL truncation is wanted
            (MRL_DIMS comes from config either way).

    Side effects: writes the trained router to models/router_v1.pkl and
    upserts all points into the Qdrant collection named COLLECTION_NAME.
    """
    print(">>> Starting Ingestion Pipeline for Qdrant Cloud...")
    if QDRANT_URL == ":memory:":
        # Deliberately continue so the pipeline logic can still be
        # exercised locally; production must set the QDRANT_URL env var.
        print("WARNING: QDRANT_URL is still :memory:. Please set QDRANT_URL env var for production.")

    # 1. Load data
    print(f"Loading {n_samples} samples from MS MARCO...")
    raw_texts = load_ms_marco(n_samples)

    # 2. Generate embeddings
    model_name = EMBEDDING_MODELS[model_key]
    print(f"Generating embeddings using {model_name}...")
    embeddings = get_embeddings(model_name, raw_texts)
    vector_dim = embeddings.shape[1]

    # 3. Train the router on this data so it learns the cluster layout.
    print("Training Router...")
    router = LearnedRouter(model_type="lightgbm", n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
    router.train(embeddings)

    # Persist the router for serving.
    os.makedirs("models", exist_ok=True)
    router.save("models/router_v1.pkl")

    # 4. Assign ground-truth clusters for indexing.
    # Using the router's internal KMeans guarantees each point is stored
    # in the shard the router *should* predict for it (mostly).
    print("Assigning clusters...")
    cluster_ids = router.kmeans.predict(embeddings)

    # 5. Index into Qdrant.
    print("Initializing Qdrant...")
    db = UnifiedQdrant(
        collection_name=COLLECTION_NAME,
        vector_size=vector_dim,
        num_clusters=NUM_CLUSTERS,
        freshness_shard_id=FRESHNESS_SHARD_ID
    )
    db.initialize()

    print("Indexing data...")
    # index_data groups points by shard internally, which is what the
    # custom-sharding scheme wants, so we can hand it everything at once.
    payloads = [{"text": t, "origin": "ms_marco"} for t in raw_texts]
    db.index_data(embeddings, payloads, cluster_ids)

    print(">>> Ingestion Complete!")


if __name__ == "__main__":
    ingest_data()