justmotes committed on
Commit 9a9f1fb · verified · 1 parent: 51fc709

Deploy 9-Row Benchmark (via API)

COLAB_INSTRUCTIONS.md ADDED
@@ -0,0 +1,26 @@
+ # Running dashVectorspace on Google Colab
+
+ Since the ingestion process can be compute-intensive, running it on Google Colab (especially with a GPU runtime) is a good option.
+
+ ## Steps
+
+ 1. **Set Up the Environment**:
+    Open the notebook `notebooks/dashVector_full_benchmark.ipynb`.
+    Since it clones the code directly from Hugging Face, you do **not** need to upload any zip files.
+
+    Simply **Run All Cells**. The notebook will:
+    - Clone `https://huggingface.co/spaces/dashVector/dashVectorSpace`
+    - Install dependencies
+    - Run the 25k benchmark
+    - Prompt you to download the artifacts
+
+ 2. **Download the Artifacts**:
+    The notebook saves models to the `models/` directory.
+    Zip and download this folder to your local machine:
+    ```python
+    !zip -r models.zip models
+    from google.colab import files
+    files.download('models.zip')
+    ```
+    Then place these files into `dashVectorspace/models/` and re-deploy.
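Editor's note: a minimal sketch of that last placement step, assuming `models.zip` was downloaded next to a local `dashVectorspace/` checkout (the paths are illustrative, not part of this commit):

```python
import zipfile
from pathlib import Path

# Assumption: models.zip sits next to a local dashVectorspace/ checkout
# and contains a top-level models/ folder, as produced by the notebook.
checkout = Path("dashVectorspace")
with zipfile.ZipFile("models.zip") as zf:
    zf.extractall(checkout)

# Confirm the artifacts landed where the Space expects them.
print(sorted(p.name for p in (checkout / "models").iterdir()))
```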
README.md CHANGED
@@ -1,14 +1,3 @@
- ---
- title: dashVectorSpace
- emoji: 🚀
- colorFrom: blue
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.0.0
- app_file: app.py
- pinned: false
- ---
-
  # dashVectorspace (xVector)

  **Production-Grade Learned Hybrid Retrieval Engine**
scripts/copy_models.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ import shutil
+ import glob
+
+ def copy_models():
+     src_dir = "models"
+     dest_dir = "hf_deploy/models"
+
+     os.makedirs(dest_dir, exist_ok=True)
+
+     # Copy router pickles and shard-size manifests
+     patterns = ["router_*.pkl", "shard_sizes_*.json"]
+
+     for pattern in patterns:
+         files = glob.glob(os.path.join(src_dir, pattern))
+         for f in files:
+             shutil.copy(f, dest_dir)
+             print(f"Copied {f} to {dest_dir}")
+
+ if __name__ == "__main__":
+     copy_models()
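Editor's note: `ingest_full_benchmark.py` below also writes `models/model_info_<key>.json`, which these patterns do not match. If the deployed app reads those files at runtime too (an assumption this commit does not confirm), a small extension would cover them:

```python
import glob
import os
import shutil

# Assumption: the Space also needs model_info_*.json at deploy time,
# so copy it alongside the router pickles and shard-size manifests.
os.makedirs("hf_deploy/models", exist_ok=True)
patterns = ["router_*.pkl", "shard_sizes_*.json", "model_info_*.json"]
for pattern in patterns:
    for f in glob.glob(os.path.join("models", pattern)):
        shutil.copy(f, "hf_deploy/models")
        print(f"Copied {f}")
```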
scripts/debug_conn.py ADDED
@@ -0,0 +1,22 @@
+ import httpx
+
+ url = "https://justmotes-xvector-db-node.hf.space/collections"
+ headers = {"api-key": "xvector_secret_pass_123"}
+
+ print(f"Testing connection to {url}...")
+ try:
+     with httpx.Client(http2=True, verify=False) as client:
+         r = client.get(url, headers=headers, timeout=10)
+         print(f"HTTP/2 Status: {r.status_code}")
+         print(r.text[:200])
+ except Exception as e:
+     print(f"HTTP/2 Failed: {e}")
+
+ try:
+     with httpx.Client(http2=False, verify=False) as client:
+         r = client.get(url, headers=headers, timeout=10)
+         print(f"HTTP/1.1 Status: {r.status_code}")
+         print(r.text[:200])
+ except Exception as e:
+     print(f"HTTP/1.1 Failed: {e}")
scripts/debug_qdrant_client.py ADDED
@@ -0,0 +1,32 @@
+ from qdrant_client import QdrantClient
+
+ url = "https://justmotes-xvector-db-node.hf.space"
+ api_key = "xvector_secret_pass_123"
+
+ print(f"Testing QdrantClient to {url}...")
+
+ try:
+     print("Attempt 1: Standard (verify=False, check_compatibility=False)")
+     client = QdrantClient(url=url, api_key=api_key, timeout=10, verify=False, check_compatibility=False)
+     print(client.get_collections())
+     print("Success 1")
+ except Exception as e:
+     print(f"Failed 1: {e}")
+
+ try:
+     print("\nAttempt 2: prefer_grpc=False")
+     client = QdrantClient(url=url, api_key=api_key, timeout=10, verify=False, check_compatibility=False, prefer_grpc=False)
+     print(client.get_collections())
+     print("Success 2")
+ except Exception as e:
+     print(f"Failed 2: {e}")
+
+ try:
+     print("\nAttempt 3: port=443")
+     client = QdrantClient(url=url, port=443, api_key=api_key, timeout=10, verify=False, check_compatibility=False)
+     print(client.get_collections())
+     print(f"Exists 'dashVector_v1': {client.collection_exists('dashVector_v1')}")
+     print("Success 3")
+ except Exception as e:
+     print(f"Failed 3: {e}")
scripts/diagnostic.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ import sys
+
+ # Fix path to allow importing from src
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ # Force cloud config for the diagnostic. Set BEFORE importing config/src so
+ # anything that reads the environment at import time sees the override.
+ os.environ["QDRANT_URL"] = "https://justmotes-xvector-db-node.hf.space"
+ os.environ["QDRANT_API_KEY"] = "xvector_secret_pass_123"
+
+ from src.vector_db import UnifiedQdrant
+ from src.router import LearnedRouter
+ from src.comparison import ComparisonEngine
+ from config import COLLECTION_NAME, NUM_CLUSTERS, FRESHNESS_SHARD_ID
+
+ def run_diagnostic():
+     print(">>> Starting Diagnostic Check...")
+
+     # 1. Check Qdrant Connection
+     print("\n1. Checking Qdrant Connection...")
+     try:
+         db = UnifiedQdrant(
+             collection_name=COLLECTION_NAME,
+             vector_size=384,
+             num_clusters=NUM_CLUSTERS,
+             freshness_shard_id=FRESHNESS_SHARD_ID
+         )
+         print("  - Initializing UnifiedQdrant...")
+         db.initialize()
+
+         # Check the specific collection
+         if db.client.collection_exists(COLLECTION_NAME):
+             info = db.client.get_collection(COLLECTION_NAME)
+             print(f"✅ Collection '{COLLECTION_NAME}' exists.")
+             print(f"  - Status: {info.status}")
+             print(f"  - Points: {info.points_count}")
+             if info.points_count == 0:
+                 print("⚠️ WARNING: Collection is empty! Ingestion might have failed.")
+         else:
+             print(f"❌ Collection '{COLLECTION_NAME}' does NOT exist.")
+             return
+
+     except Exception as e:
+         print(f"❌ Qdrant Connection Failed: {e}")
+         return
+
+     # 2. Check Router
+     print("\n2. Checking Router Model...")
+     router_path = "models/router_v1.pkl"
+     if os.path.exists(router_path):
+         try:
+             router = LearnedRouter.load(router_path)
+             print(f"✅ Router loaded from {router_path}")
+             print(f"  - Clusters: {router.n_clusters}")
+         except Exception as e:
+             print(f"❌ Failed to load router: {e}")
+             return
+     else:
+         print(f"❌ Router file not found at {router_path}")
+         return
+
+     # 3. Test Search Logic
+     print("\n3. Testing Search Logic...")
+     try:
+         engine = ComparisonEngine(db, router, embedding_model_name="minilm")
+         query = "What is the capital of France?"
+
+         print(f"  - Query: '{query}'")
+
+         # Direct Search
+         print("  - Running Direct Search...")
+         res_direct = engine.direct_search(query)
+         print(f"    -> Found {len(res_direct['results'])} results. Latency: {res_direct['latency_ms']:.2f}ms")
+
+         # xVector Search
+         print("  - Running xVector Search...")
+         res_xvector = engine.xvector_search(query)
+         print(f"    -> Found {len(res_xvector['results'])} results. Latency: {res_xvector['latency_ms']:.2f}ms")
+         print(f"    -> Mode: {res_xvector['mode']}")
+         print(f"    -> Target Cluster: {res_xvector.get('target_cluster')}")
+
+         if len(res_direct['results']) > 0:
+             print("✅ Search Logic Verified.")
+         else:
+             print("⚠️ Search returned 0 results. Data might be missing or embeddings mismatched.")
+
+     except Exception as e:
+         print(f"❌ Search Logic Failed: {e}")
+         import traceback
+         traceback.print_exc()
+
+ if __name__ == "__main__":
+     run_diagnostic()
scripts/ingest_full_benchmark.py ADDED
@@ -0,0 +1,149 @@
+ import sys
+ import os
+ import json
+
+ # Add project root to path
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ # Configuration
+ N_SAMPLES = 25000        # Full Benchmark
+ NUM_CLUSTERS = 32        # Production Cluster Count
+ FRESHNESS_SHARD_ID = 0
+
+ from config import (
+     MRL_DIMS,
+     EMBEDDING_MODELS, ROUTER_MODELS, COLLECTIONS,
+     QDRANT_URL, QDRANT_API_KEY
+ )
+ from src.data_pipeline import get_embeddings, load_ms_marco
+ from src.router import LearnedRouter
+ from src.vector_db import UnifiedQdrant
+
+ def ingest_full_benchmark():
+     print(">>> Starting Full Benchmark Ingestion Pipeline...")
+
+     # 1. Load Data
+     print(f"Loading {N_SAMPLES} samples from MS MARCO...")
+     raw_texts = load_ms_marco(N_SAMPLES)
+
+     # Loop through each embedding model
+     for model_key, model_name in EMBEDDING_MODELS.items():
+         print("\n==================================================")
+         print(f"Processing Embedding Model: {model_key.upper()} ({model_name})")
+         print("==================================================")
+
+         # 2. Generate Embeddings
+         print("Generating embeddings...")
+         embeddings = get_embeddings(model_name, raw_texts)
+         vector_dim = embeddings.shape[1]
+         print(f"Embeddings generated. Shape: {embeddings.shape}")
+
+         # Save model info (dimension) for the app
+         model_info_path = f"models/model_info_{model_key}.json"
+         with open(model_info_path, "w") as f:
+             json.dump({"dim": vector_dim}, f)
+         print(f"Saved model info to {model_info_path}")
+
+         # 3. Baseline Collection (Unsharded)
+         base_col_name = COLLECTIONS[model_key]["base"]
+         print(f"\n--- Setting up Baseline Collection: {base_col_name} ---")
+         db_base = UnifiedQdrant(
+             collection_name=base_col_name,
+             vector_size=vector_dim,
+             num_clusters=1  # Unsharded
+         )
+         db_base.initialize(is_baseline=True)
+
+         print("Indexing data into Baseline...")
+         payloads = [{"text": text, "source": "ms_marco"} for text in raw_texts]
+         db_base.index_data(embeddings, payloads, cluster_ids=None)  # None = standard upsert
+         print("Baseline Indexing Complete.")
+
+         # 4. Train Routers & Prod Collection (Sharded)
+         prod_col_name = COLLECTIONS[model_key]["prod"]
+         print(f"\n--- Setting up Prod Collection: {prod_col_name} ---")
+
+         # Indexing needs a ground-truth cluster per point. We run K-Means once
+         # (via a "master" router) to define the physical shard layout, then
+         # train every router to predict those same labels.
+         print("Running K-Means to define Physical Shards...")
+         master_router = LearnedRouter(model_type="logistic", n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
+         master_router.train(embeddings)
+         cluster_labels = master_router.kmeans.labels_  # physical shard assignment
+
+         # Initialize Prod DB
+         db_prod = UnifiedQdrant(
+             collection_name=prod_col_name,
+             vector_size=vector_dim,
+             num_clusters=NUM_CLUSTERS,
+             freshness_shard_id=FRESHNESS_SHARD_ID
+         )
+         db_prod.initialize(is_baseline=False)
+
+         print("Indexing data into Prod (Sharded)...")
+         target_clusters = [int(c) for c in cluster_labels]
+         db_prod.index_data(embeddings, payloads, cluster_ids=target_clusters)
+
+         # Save Shard Sizes
+         print("Saving Shard Sizes...")
+         shard_sizes = db_prod.get_shard_sizes()
+         size_path = f"models/shard_sizes_{model_key}.json"
+         with open(size_path, "w") as f:
+             json.dump(shard_sizes, f)
+         print(f"Shard sizes saved to {size_path}")
+
+         # 5. Train & Save All Routers
+         # CRITICAL: if each router re-ran K-Means internally, they could converge
+         # to DIFFERENT clusterings. All routers must share the master K-Means
+         # (the physical layout), so we inject it and train only the classifier
+         # against the shared labels (requires router.train() to accept `labels`).
+         print("\n--- Training Routers ---")
+         kmeans_model = master_router.kmeans  # reuse the shared K-Means
+
+         for router_type in ROUTER_MODELS:
+             print(f"Training {router_type.upper()}...")
+             router = LearnedRouter(model_type=router_type, n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
+             router.kmeans = kmeans_model  # inject shared K-Means
+             router.train(embeddings, labels=cluster_labels)
+
+             save_name = f"router_{model_key}_{router_type}.pkl"
+             save_path = os.path.abspath(f"models/{save_name}")
+             os.makedirs(os.path.dirname(save_path), exist_ok=True)
+             router.save(save_path)
+             print(f"Saved {save_name}")
+
+     print("\n>>> Full Benchmark Ingestion Complete!")
+
+ if __name__ == "__main__":
+     ingest_full_benchmark()
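Editor's note: the loop above assumes `LearnedRouter.train()` accepts precomputed `labels` and skips its internal K-Means when a shared model has been injected; that change to `router.py` is not part of this commit. A hedged sketch of what it could look like (`LearnedRouterSketch`, `n_init`, and the classifier hook are illustrative):

```python
import numpy as np
from sklearn.cluster import KMeans

class LearnedRouterSketch:
    """Illustrative only: the shape of a train() that honors injected K-Means."""

    def __init__(self, n_clusters: int):
        self.n_clusters = n_clusters
        self.kmeans = None  # may be injected so all routers share one layout

    def train(self, embeddings: np.ndarray, labels=None):
        if labels is None:
            if self.kmeans is None:
                # No shared layout supplied: cluster once ourselves.
                self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10)
                self.kmeans.fit(embeddings)
            labels = self.kmeans.labels_
        # ...fit the actual classifier (logistic / lightgbm / mlp) on
        # (embeddings, labels) here; K-Means is never re-run when labels
        # or a shared self.kmeans are supplied.
        return labels
```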
scripts/ingest_ms_marco.py CHANGED
@@ -15,6 +15,8 @@ from src.data_pipeline import get_embeddings, load_ms_marco
from src.router import LearnedRouter
from src.vector_db import UnifiedQdrant

+ ROUTER_PATH = "models/router_v1.pkl"
+
def ingest_data():
    print(">>> Starting Ingestion Pipeline for Qdrant Cloud...")

@@ -24,7 +26,7 @@ def ingest_data():

    # 1. Load Data (101k samples for production proof)
    # For demo speed, we might start with 10k, but let's aim for 20k to be significant.
-     N_SAMPLES = 1000
+     N_SAMPLES = 25000
    print(f"Loading {N_SAMPLES} samples from MS MARCO...")
    raw_texts = load_ms_marco(N_SAMPLES)

@@ -42,41 +44,54 @@ def ingest_data():
    vector_dim = embeddings.shape[1]

    # 3. Train Router
-     # We need to train the router on this data to cluster it.
    print("Training Router...")
    router = LearnedRouter(model_type="lightgbm", n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
+     # Train router (runs KMeans internally if labels are not provided)
    router.train(embeddings)

-     # Save Router
-     os.makedirs("models", exist_ok=True)
-     router.save("models/router_v1.pkl")
+     # Get cluster labels for indexing
+     cluster_labels = router.kmeans.predict(embeddings)
+     print("Router training complete.")

-     # 4. Assign Clusters (Ground Truth for Indexing)
+     # Save Router
+     abs_router_path = os.path.abspath(ROUTER_PATH)
+     print(f"Saving router to {abs_router_path}...")
+     os.makedirs(os.path.dirname(abs_router_path), exist_ok=True)
+     router.save(abs_router_path)
+     print("Router saved.")
+
+     # 4. Index Data
    print("Assigning clusters...")
-     # We use the router's internal KMeans to get the "Ground Truth" cluster for each point.
-     # This ensures that the data actually lives where the router *should* predict it to be (mostly).
-     cluster_ids = router.kmeans.predict(embeddings)
+     # cluster_labels (from KMeans above) already tell us which cluster each point belongs to.

-     # 5. Index to Qdrant
    print("Initializing Qdrant...")
    db = UnifiedQdrant(
        collection_name=COLLECTION_NAME,
        vector_size=vector_dim,
        num_clusters=NUM_CLUSTERS,
        freshness_shard_id=FRESHNESS_SHARD_ID
    )
    db.initialize()

    print("Indexing data...")
-     # Batching is handled inside index_data somewhat, but let's pass it all.
-     # The index_data method groups by shard, which is efficient for custom sharding.
-
-     payloads = [{"text": t, "origin": "ms_marco"} for t in raw_texts]
-
-     db.index_data(embeddings, payloads, cluster_ids)
+     payloads = [{"text": text, "source": "ms_marco"} for text in raw_texts]
+
+     # Use the cluster labels as the target shards (cast numpy int32 -> int).
+     target_clusters = [int(c) for c in cluster_labels]
+
+     db.index_data(embeddings, payloads, target_clusters)
+
+     # Verify Count
+     print("Verifying index count...")
+     info = db.client.get_collection(COLLECTION_NAME)
+     print(f"Collection '{COLLECTION_NAME}' has {info.points_count} points.")

    print(">>> Ingestion Complete!")

if __name__ == "__main__":
    ingest_data()
+
scripts/train_routers_only.py ADDED
@@ -0,0 +1,49 @@
+ import sys
+ import os
+
+ # Add project root to path
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ from config import (
+     NUM_CLUSTERS, MRL_DIMS, EMBEDDING_MODELS
+ )
+ from src.data_pipeline import get_embeddings, load_ms_marco
+ from src.router import LearnedRouter
+
+ def train_routers():
+     print(">>> Starting Router Training Pipeline...")
+
+     # 1. Load Data
+     N_SAMPLES = 25000
+     print(f"Loading {N_SAMPLES} samples from MS MARCO...")
+     raw_texts = load_ms_marco(N_SAMPLES)
+
+     # 2. Generate Embeddings (MiniLM-L6-v2)
+     MODEL_NAME = EMBEDDING_MODELS["minilm"]
+     print(f"Generating embeddings using {MODEL_NAME}...")
+     embeddings = get_embeddings(MODEL_NAME, raw_texts)
+
+     # 3. Train & Save Routers
+     models_to_train = [
+         ("logistic", "router_logistic.pkl"),
+         ("lightgbm", "router_lightgbm.pkl"),
+         ("mlp", "router_mlp.pkl")
+     ]
+
+     for model_type, filename in models_to_train:
+         print(f"\n--- Training {model_type.upper()} Router ---")
+         router = LearnedRouter(model_type=model_type, n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
+         router.train(embeddings)
+
+         save_path = os.path.abspath(f"models/{filename}")
+         print(f"Saving to {save_path}...")
+         os.makedirs(os.path.dirname(save_path), exist_ok=True)
+         router.save(save_path)
+         print(f"{model_type.upper()} Router saved.")
+
+     print("\n>>> All Routers Trained & Saved!")
+
+ if __name__ == "__main__":
+     train_routers()
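Editor's note: for reference, consuming one of these artifacts follows the interface used in `scripts/diagnostic.py` (`LearnedRouter.load`, `router.n_clusters`, `router.kmeans`). The MiniLM model id below is the conventional name for MiniLM-L6-v2 and is an assumption, since `EMBEDDING_MODELS` itself is not shown in this commit:

```python
from src.router import LearnedRouter
from src.data_pipeline import get_embeddings

# Load an artifact produced by this script (interface as in scripts/diagnostic.py).
router = LearnedRouter.load("models/router_logistic.pkl")
print(f"Clusters: {router.n_clusters}")

# Illustrative: embed a query and see which physical shard K-Means assigns it to.
vec = get_embeddings("sentence-transformers/all-MiniLM-L6-v2", ["what is qdrant?"])
print(router.kmeans.predict(vec))
```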
src/vector_db.py CHANGED
@@ -104,10 +104,13 @@ class UnifiedQdrant:

    def index_data(self, vectors: np.ndarray, payloads: List[Dict[str, Any]], cluster_ids: List[Optional[int]] = None):
        """
-         Indexes data.
+         Indexes data with batching to avoid payload limits.
        If cluster_ids provided, uses custom sharding (Prod).
        If cluster_ids is None, uses standard upsert (Baseline/Local).
+         BATCH_SIZE is hardcoded to 500 for safety.
        """
+         BATCH_SIZE = 500
+
        if cluster_ids is None or self.is_local:
            # Standard Upsert
            points = [
@@ -117,11 +120,16 @@ class UnifiedQdrant:
                    payload=payloads[i]
                ) for i, vec in enumerate(vectors)
            ]
-             # Batching is better, but for simplicity:
-             self.client.upsert(
-                 collection_name=self.collection_name,
-                 points=points
-             )
+
+             # Batching
+             total = len(points)
+             print(f"Upserting {total} points to '{self.collection_name}' (Standard)...")
+             for i in range(0, total, BATCH_SIZE):
+                 batch = points[i : i + BATCH_SIZE]
+                 self.client.upsert(
+                     collection_name=self.collection_name,
+                     points=batch
+                 )
            return

        # Custom Sharding Upsert
@@ -141,13 +149,19 @@ class UnifiedQdrant:
            )
        )

-         print(f"Indexing data across {len(data_by_shard)} shards...")
-         for key, batch_points in data_by_shard.items():
-             self.client.upsert(
-                 collection_name=self.collection_name,
-                 points=batch_points,
-                 shard_key_selector=key
-             )
+         print(f"Indexing data across {len(data_by_shard)} shards (Custom Sharded)...")
+         for key, shard_points in data_by_shard.items():
+             # Batch per shard too. At 25k samples over 32 shards (~800 points of
+             # roughly 8 KB each) a shard stays far below the 32 MB payload limit,
+             # but batching keeps this safe for larger runs.
+             total_shard = len(shard_points)
+             for i in range(0, total_shard, BATCH_SIZE):
+                 batch = shard_points[i : i + BATCH_SIZE]
+                 self.client.upsert(
+                     collection_name=self.collection_name,
+                     points=batch,
+                     shard_key_selector=key
+                 )

    def search_hybrid(self, query_vec: np.ndarray, target_clusters: List[int], confidence: float) -> List[Any]:
        """