File size: 3,588 Bytes
b92d96d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a9f1fb
 
b92d96d
 
 
 
 
 
 
 
 
9a9f1fb
b92d96d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a9f1fb
b92d96d
 
9a9f1fb
 
 
b92d96d
9a9f1fb
 
 
 
 
 
 
 
b92d96d
9a9f1fb
 
b92d96d
 
 
 
 
9a9f1fb
b92d96d
 
 
 
 
9a9f1fb
 
 
b92d96d
9a9f1fb
 
 
b92d96d
9a9f1fb
b92d96d
9a9f1fb
 
 
 
 
b92d96d
 
 
 
9a9f1fb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import sys
import os
import numpy as np
from tqdm import tqdm

# Add project root to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from config import (
    NUM_CLUSTERS, FRESHNESS_SHARD_ID, MRL_DIMS, 
    EMBEDDING_MODELS, ROUTER_MODELS, COLLECTION_NAME,
    QDRANT_URL, QDRANT_API_KEY
)
from src.data_pipeline import get_embeddings, load_ms_marco
from src.router import LearnedRouter
from src.vector_db import UnifiedQdrant

ROUTER_PATH = "models/router_v1.pkl"

def _load_corpus(n_samples):
    """Load `n_samples` raw text passages from MS MARCO."""
    print(f"Loading {n_samples} samples from MS MARCO...")
    return load_ms_marco(n_samples)


def _embed_corpus(raw_texts, model_key):
    """Embed `raw_texts` with the configured model; returns (embeddings, vector_dim).

    `model_key` indexes EMBEDDING_MODELS — 'minilm' (384-dim, fast baseline)
    or 'nomic' (768-dim, supports MRL truncation per config MRL_DIMS).
    """
    model_name = EMBEDDING_MODELS[model_key]
    print(f"Generating embeddings using {model_name}...")
    embeddings = get_embeddings(model_name, raw_texts)
    return embeddings, embeddings.shape[1]


def _train_and_save_router(embeddings):
    """Fit the LearnedRouter on `embeddings`, persist it to ROUTER_PATH,
    and return the per-point KMeans cluster labels used for sharded indexing."""
    print("Training Router...")
    router = LearnedRouter(model_type="lightgbm", n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
    # train() runs KMeans internally when labels are not provided.
    router.train(embeddings)

    # Cluster assignment for each point drives which shard it lands in.
    cluster_labels = router.kmeans.predict(embeddings)
    print("Router training complete.")

    abs_router_path = os.path.abspath(ROUTER_PATH)
    print(f"Saving router to {abs_router_path}...")
    os.makedirs(os.path.dirname(abs_router_path), exist_ok=True)
    router.save(abs_router_path)
    print("Router saved.")
    return cluster_labels


def _index_corpus(embeddings, raw_texts, cluster_labels, vector_dim):
    """Initialize the Qdrant collection and index all points into their
    cluster shards, then verify the stored point count."""
    print("Assigning clusters...")

    print("Initializing Qdrant...")
    db = UnifiedQdrant(
        collection_name=COLLECTION_NAME,
        vector_size=vector_dim,
        num_clusters=NUM_CLUSTERS,
        freshness_shard_id=FRESHNESS_SHARD_ID,
    )
    db.initialize()

    print("Indexing data...")
    payloads = [{"text": text, "source": "ms_marco"} for text in raw_texts]
    # cluster_labels come back as numpy int32; downstream expects plain ints.
    target_clusters = [int(c) for c in cluster_labels]
    db.index_data(embeddings, payloads, target_clusters)

    print("Verifying index count...")
    info = db.client.get_collection(COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' has {info.points_count} points.")


def ingest_data(n_samples=25000, model_key="minilm"):
    """Run the full ingestion pipeline: load MS MARCO passages, embed them,
    train + persist the learned router, and index everything into Qdrant.

    Args:
        n_samples: Number of passages to ingest (default 25000).
        model_key: Key into EMBEDDING_MODELS selecting the embedding model
            (default 'minilm', the fast 384-dim baseline).
    """
    print(">>> Starting Ingestion Pipeline for Qdrant Cloud...")

    if QDRANT_URL == ":memory:":
        # Best-effort warning only: an in-memory URL still works for local testing.
        print("WARNING: QDRANT_URL is still :memory:. Please set QDRANT_URL env var for production.")

    raw_texts = _load_corpus(n_samples)
    embeddings, vector_dim = _embed_corpus(raw_texts, model_key)
    cluster_labels = _train_and_save_router(embeddings)
    _index_corpus(embeddings, raw_texts, cluster_labels, vector_dim)

    print(">>> Ingestion Complete!")

# Script entry point: run the full ingestion pipeline when executed directly.
if __name__ == "__main__":
    ingest_data()