justmotes committed on
Commit 9a9f1fb · verified · 1 parent: 51fc709

Deploy 9-Row Benchmark (via API)

COLAB_INSTRUCTIONS.md ADDED
@@ -0,0 +1,26 @@
+ # Running dashVectorspace on Google Colab
+
+ Since the ingestion process can be compute-intensive, running it on Google Colab (especially with a GPU runtime) is a good option.
+
+ ## Steps
+
+ 1. **Set Up the Environment**:
+    Open the notebook `notebooks/dashVector_full_benchmark.ipynb`.
+    Since it clones the code directly from Hugging Face, you do **not** need to upload any zip files.
+
+    Simply **Run All Cells**. The notebook will:
+    - Clone `https://huggingface.co/spaces/dashVector/dashVectorSpace`
+    - Install dependencies
+    - Run the 25k benchmark
+    - Prompt you to download the artifacts
+
+ 2. **Download the Artifacts**:
+    The notebook saves models to the `models/` directory.
+    Zip and download this folder to your local machine:
+    ```python
+    !zip -r models.zip models
+    from google.colab import files
+    files.download('models.zip')
+    ```
+    Then place these files into `dashVectorspace/models/` and re-deploy.
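Editor's note: a minimal sketch of that last placement step, assuming `models.zip` was downloaded next to a local `dashVectorspace/` checkout (the paths are illustrative, not part of this commit):

```python
import zipfile
from pathlib import Path

# Assumption: models.zip sits next to a local dashVectorspace/ checkout
# and contains a top-level models/ folder, as produced by the notebook.
checkout = Path("dashVectorspace")
with zipfile.ZipFile("models.zip") as zf:
    zf.extractall(checkout)

# Confirm the artifacts landed where the Space expects them.
print(sorted(p.name for p in (checkout / "models").iterdir()))
```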
README.md CHANGED
@@ -1,14 +1,3 @@
- ---
- title: dashVectorSpace
- emoji: 🚀
- colorFrom: blue
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.0.0
- app_file: app.py
- pinned: false
- ---
-
  # dashVectorspace (xVector)

  **Production-Grade Learned Hybrid Retrieval Engine**
scripts/copy_models.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ import shutil
+ import glob
+
+ def copy_models():
+     src_dir = "models"
+     dest_dir = "hf_deploy/models"
+
+     os.makedirs(dest_dir, exist_ok=True)
+
+     # Copy router pickles and shard-size manifests
+     patterns = ["router_*.pkl", "shard_sizes_*.json"]
+
+     for pattern in patterns:
+         files = glob.glob(os.path.join(src_dir, pattern))
+         for f in files:
+             shutil.copy(f, dest_dir)
+             print(f"Copied {f} to {dest_dir}")
+
+ if __name__ == "__main__":
+     copy_models()
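Editor's note: `ingest_full_benchmark.py` below also writes `models/model_info_<key>.json`, which these patterns do not match. If the deployed app reads those files at runtime too (an assumption this commit does not confirm), a small extension would cover them:

```python
import glob
import os
import shutil

# Assumption: the Space also needs model_info_*.json at deploy time,
# so copy it alongside the router pickles and shard-size manifests.
os.makedirs("hf_deploy/models", exist_ok=True)
patterns = ["router_*.pkl", "shard_sizes_*.json", "model_info_*.json"]
for pattern in patterns:
    for f in glob.glob(os.path.join("models", pattern)):
        shutil.copy(f, "hf_deploy/models")
        print(f"Copied {f}")
```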
scripts/debug_conn.py ADDED
@@ -0,0 +1,22 @@
+ import httpx
+
+ url = "https://justmotes-xvector-db-node.hf.space/collections"
+ headers = {"api-key": "xvector_secret_pass_123"}
+
+ print(f"Testing connection to {url}...")
+ try:
+     with httpx.Client(http2=True, verify=False) as client:
+         r = client.get(url, headers=headers, timeout=10)
+         print(f"HTTP/2 Status: {r.status_code}")
+         print(r.text[:200])
+ except Exception as e:
+     print(f"HTTP/2 Failed: {e}")
+
+ try:
+     with httpx.Client(http2=False, verify=False) as client:
+         r = client.get(url, headers=headers, timeout=10)
+         print(f"HTTP/1.1 Status: {r.status_code}")
+         print(r.text[:200])
+ except Exception as e:
+     print(f"HTTP/1.1 Failed: {e}")
scripts/debug_qdrant_client.py ADDED
@@ -0,0 +1,32 @@
+ from qdrant_client import QdrantClient
+
+ url = "https://justmotes-xvector-db-node.hf.space"
+ api_key = "xvector_secret_pass_123"
+
+ print(f"Testing QdrantClient to {url}...")
+
+ try:
+     print("Attempt 1: Standard (verify=False, check_compatibility=False)")
+     client = QdrantClient(url=url, api_key=api_key, timeout=10, verify=False, check_compatibility=False)
+     print(client.get_collections())
+     print("Success 1")
+ except Exception as e:
+     print(f"Failed 1: {e}")
+
+ try:
+     print("\nAttempt 2: prefer_grpc=False")
+     client = QdrantClient(url=url, api_key=api_key, timeout=10, verify=False, check_compatibility=False, prefer_grpc=False)
+     print(client.get_collections())
+     print("Success 2")
+ except Exception as e:
+     print(f"Failed 2: {e}")
+
+ try:
+     print("\nAttempt 3: port=443")
+     client = QdrantClient(url=url, port=443, api_key=api_key, timeout=10, verify=False, check_compatibility=False)
+     print(client.get_collections())
+     print(f"Exists 'dashVector_v1': {client.collection_exists('dashVector_v1')}")
+     print("Success 3")
+ except Exception as e:
+     print(f"Failed 3: {e}")
scripts/diagnostic.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ import sys
+
+ # Fix path to allow importing from src
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ # Force cloud config for the diagnostic. Set BEFORE importing config/src so
+ # anything that reads the environment at import time sees the override.
+ os.environ["QDRANT_URL"] = "https://justmotes-xvector-db-node.hf.space"
+ os.environ["QDRANT_API_KEY"] = "xvector_secret_pass_123"
+
+ from src.vector_db import UnifiedQdrant
+ from src.router import LearnedRouter
+ from src.comparison import ComparisonEngine
+ from config import COLLECTION_NAME, NUM_CLUSTERS, FRESHNESS_SHARD_ID
+
+ def run_diagnostic():
+     print(">>> Starting Diagnostic Check...")
+
+     # 1. Check Qdrant Connection
+     print("\n1. Checking Qdrant Connection...")
+     try:
+         db = UnifiedQdrant(
+             collection_name=COLLECTION_NAME,
+             vector_size=384,
+             num_clusters=NUM_CLUSTERS,
+             freshness_shard_id=FRESHNESS_SHARD_ID
+         )
+         print("  - Initializing UnifiedQdrant...")
+         db.initialize()
+
+         # Check the specific collection
+         if db.client.collection_exists(COLLECTION_NAME):
+             info = db.client.get_collection(COLLECTION_NAME)
+             print(f"✅ Collection '{COLLECTION_NAME}' exists.")
+             print(f"  - Status: {info.status}")
+             print(f"  - Points: {info.points_count}")
+             if info.points_count == 0:
+                 print("⚠️ WARNING: Collection is empty! Ingestion might have failed.")
+         else:
+             print(f"❌ Collection '{COLLECTION_NAME}' does NOT exist.")
+             return
+
+     except Exception as e:
+         print(f"❌ Qdrant Connection Failed: {e}")
+         return
+
+     # 2. Check Router
+     print("\n2. Checking Router Model...")
+     router_path = "models/router_v1.pkl"
+     if os.path.exists(router_path):
+         try:
+             router = LearnedRouter.load(router_path)
+             print(f"✅ Router loaded from {router_path}")
+             print(f"  - Clusters: {router.n_clusters}")
+         except Exception as e:
+             print(f"❌ Failed to load router: {e}")
+             return
+     else:
+         print(f"❌ Router file not found at {router_path}")
+         return
+
+     # 3. Test Search Logic
+     print("\n3. Testing Search Logic...")
+     try:
+         engine = ComparisonEngine(db, router, embedding_model_name="minilm")
+         query = "What is the capital of France?"
+
+         print(f"  - Query: '{query}'")
+
+         # Direct Search
+         print("  - Running Direct Search...")
+         res_direct = engine.direct_search(query)
+         print(f"    -> Found {len(res_direct['results'])} results. Latency: {res_direct['latency_ms']:.2f}ms")
+
+         # xVector Search
+         print("  - Running xVector Search...")
+         res_xvector = engine.xvector_search(query)
+         print(f"    -> Found {len(res_xvector['results'])} results. Latency: {res_xvector['latency_ms']:.2f}ms")
+         print(f"    -> Mode: {res_xvector['mode']}")
+         print(f"    -> Target Cluster: {res_xvector.get('target_cluster')}")
+
+         if len(res_direct['results']) > 0:
+             print("✅ Search Logic Verified.")
+         else:
+             print("⚠️ Search returned 0 results. Data might be missing or embeddings mismatched.")
+
+     except Exception as e:
+         print(f"❌ Search Logic Failed: {e}")
+         import traceback
+         traceback.print_exc()
+
+ if __name__ == "__main__":
+     run_diagnostic()
scripts/ingest_full_benchmark.py ADDED
@@ -0,0 +1,149 @@
+ import sys
+ import os
+ import json
+
+ # Add project root to path
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ # Configuration
+ N_SAMPLES = 25000        # Full Benchmark
+ NUM_CLUSTERS = 32        # Production Cluster Count
+ FRESHNESS_SHARD_ID = 0
+
+ from config import (
+     MRL_DIMS,
+     EMBEDDING_MODELS, ROUTER_MODELS, COLLECTIONS,
+     QDRANT_URL, QDRANT_API_KEY
+ )
+ from src.data_pipeline import get_embeddings, load_ms_marco
+ from src.router import LearnedRouter
+ from src.vector_db import UnifiedQdrant
+
+ def ingest_full_benchmark():
+     print(">>> Starting Full Benchmark Ingestion Pipeline...")
+
+     # 1. Load Data
+     print(f"Loading {N_SAMPLES} samples from MS MARCO...")
+     raw_texts = load_ms_marco(N_SAMPLES)
+
+     # Loop through each embedding model
+     for model_key, model_name in EMBEDDING_MODELS.items():
+         print("\n==================================================")
+         print(f"Processing Embedding Model: {model_key.upper()} ({model_name})")
+         print("==================================================")
+
+         # 2. Generate Embeddings
+         print("Generating embeddings...")
+         embeddings = get_embeddings(model_name, raw_texts)
+         vector_dim = embeddings.shape[1]
+         print(f"Embeddings generated. Shape: {embeddings.shape}")
+
+         # Save model info (dimension) for the app
+         model_info_path = f"models/model_info_{model_key}.json"
+         with open(model_info_path, "w") as f:
+             json.dump({"dim": vector_dim}, f)
+         print(f"Saved model info to {model_info_path}")
+
+         # 3. Baseline Collection (Unsharded)
+         base_col_name = COLLECTIONS[model_key]["base"]
+         print(f"\n--- Setting up Baseline Collection: {base_col_name} ---")
+         db_base = UnifiedQdrant(
+             collection_name=base_col_name,
+             vector_size=vector_dim,
+             num_clusters=1  # Unsharded
+         )
+         db_base.initialize(is_baseline=True)
+
+         print("Indexing data into Baseline...")
+         payloads = [{"text": text, "source": "ms_marco"} for text in raw_texts]
+         db_base.index_data(embeddings, payloads, cluster_ids=None)  # None = standard upsert
+         print("Baseline Indexing Complete.")
+
+         # 4. Train Routers & Prod Collection (Sharded)
+         prod_col_name = COLLECTIONS[model_key]["prod"]
+         print(f"\n--- Setting up Prod Collection: {prod_col_name} ---")
+
+         # Indexing needs a ground-truth cluster per point. We run K-Means once
+         # (via a "master" router) to define the physical shard layout, then
+         # train every router to predict those same labels.
+         print("Running K-Means to define Physical Shards...")
+         master_router = LearnedRouter(model_type="logistic", n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
+         master_router.train(embeddings)
+         cluster_labels = master_router.kmeans.labels_  # physical shard assignment
+
+         # Initialize Prod DB
+         db_prod = UnifiedQdrant(
+             collection_name=prod_col_name,
+             vector_size=vector_dim,
+             num_clusters=NUM_CLUSTERS,
+             freshness_shard_id=FRESHNESS_SHARD_ID
+         )
+         db_prod.initialize(is_baseline=False)
+
+         print("Indexing data into Prod (Sharded)...")
+         target_clusters = [int(c) for c in cluster_labels]
+         db_prod.index_data(embeddings, payloads, cluster_ids=target_clusters)
+
+         # Save Shard Sizes
+         print("Saving Shard Sizes...")
+         shard_sizes = db_prod.get_shard_sizes()
+         size_path = f"models/shard_sizes_{model_key}.json"
+         with open(size_path, "w") as f:
+             json.dump(shard_sizes, f)
+         print(f"Shard sizes saved to {size_path}")
+
+         # 5. Train & Save All Routers
+         # CRITICAL: if each router re-ran K-Means internally, they could converge
+         # to DIFFERENT clusterings. All routers must share the master K-Means
+         # (the physical layout), so we inject it and train only the classifier
+         # against the shared labels (requires router.train() to accept `labels`).
+         print("\n--- Training Routers ---")
+         kmeans_model = master_router.kmeans  # reuse the shared K-Means
+
+         for router_type in ROUTER_MODELS:
+             print(f"Training {router_type.upper()}...")
+             router = LearnedRouter(model_type=router_type, n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
+             router.kmeans = kmeans_model  # inject shared K-Means
+             router.train(embeddings, labels=cluster_labels)
+
+             save_name = f"router_{model_key}_{router_type}.pkl"
+             save_path = os.path.abspath(f"models/{save_name}")
+             os.makedirs(os.path.dirname(save_path), exist_ok=True)
+             router.save(save_path)
+             print(f"Saved {save_name}")
+
+     print("\n>>> Full Benchmark Ingestion Complete!")
+
+ if __name__ == "__main__":
+     ingest_full_benchmark()
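Editor's note: the loop above assumes `LearnedRouter.train()` accepts precomputed `labels` and skips its internal K-Means when a shared model has been injected; that change to `router.py` is not part of this commit. A hedged sketch of what it could look like (`LearnedRouterSketch`, `n_init`, and the classifier hook are illustrative):

```python
import numpy as np
from sklearn.cluster import KMeans

class LearnedRouterSketch:
    """Illustrative only: the shape of a train() that honors injected K-Means."""

    def __init__(self, n_clusters: int):
        self.n_clusters = n_clusters
        self.kmeans = None  # may be injected so all routers share one layout

    def train(self, embeddings: np.ndarray, labels=None):
        if labels is None:
            if self.kmeans is None:
                # No shared layout supplied: cluster once ourselves.
                self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10)
                self.kmeans.fit(embeddings)
            labels = self.kmeans.labels_
        # ...fit the actual classifier (logistic / lightgbm / mlp) on
        # (embeddings, labels) here; K-Means is never re-run when labels
        # or a shared self.kmeans are supplied.
        return labels
```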
scripts/ingest_ms_marco.py CHANGED
@@ -15,6 +15,8 @@ from src.data_pipeline import get_embeddings, load_ms_marco
from src.router import LearnedRouter
from src.vector_db import UnifiedQdrant

+ ROUTER_PATH = "models/router_v1.pkl"
+
def ingest_data():
    print(">>> Starting Ingestion Pipeline for Qdrant Cloud...")

@@ -24,7 +26,7 @@ def ingest_data():

    # 1. Load Data (101k samples for production proof)
    # For demo speed, we might start with 10k, but let's aim for 20k to be significant.
-     N_SAMPLES = 1000
+     N_SAMPLES = 25000
    print(f"Loading {N_SAMPLES} samples from MS MARCO...")
    raw_texts = load_ms_marco(N_SAMPLES)

@@ -42,41 +44,54 @@ def ingest_data():
    vector_dim = embeddings.shape[1]

    # 3. Train Router
-     # We need to train the router on this data to cluster it.
    print("Training Router...")
    router = LearnedRouter(model_type="lightgbm", n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
+     # Train router (runs KMeans internally if labels are not provided)
    router.train(embeddings)

-     # Save Router
-     os.makedirs("models", exist_ok=True)
-     router.save("models/router_v1.pkl")
+     # Get cluster labels for indexing
+     cluster_labels = router.kmeans.predict(embeddings)
+     print("Router training complete.")

-     # 4. Assign Clusters (Ground Truth for Indexing)
+     # Save Router
+     abs_router_path = os.path.abspath(ROUTER_PATH)
+     print(f"Saving router to {abs_router_path}...")
+     os.makedirs(os.path.dirname(abs_router_path), exist_ok=True)
+     router.save(abs_router_path)
+     print("Router saved.")
+
+     # 4. Index Data
    print("Assigning clusters...")
-     # We use the router's internal KMeans to get the "Ground Truth" cluster for each point.
-     # This ensures that the data actually lives where the router *should* predict it to be (mostly).
-     cluster_ids = router.kmeans.predict(embeddings)
+     # cluster_labels (from KMeans above) already tell us which cluster each point belongs to.

-     # 5. Index to Qdrant
    print("Initializing Qdrant...")
    db = UnifiedQdrant(
        collection_name=COLLECTION_NAME,
        vector_size=vector_dim,
        num_clusters=NUM_CLUSTERS,
        freshness_shard_id=FRESHNESS_SHARD_ID
    )
    db.initialize()

    print("Indexing data...")
-     # Batching is handled inside index_data somewhat, but let's pass it all.
-     # The index_data method groups by shard, which is efficient for custom sharding.
-
-     payloads = [{"text": t, "origin": "ms_marco"} for t in raw_texts]
-
-     db.index_data(embeddings, payloads, cluster_ids)
+     payloads = [{"text": text, "source": "ms_marco"} for text in raw_texts]
+
+     # Use the cluster labels as the target shards (cast numpy int32 -> int).
+     target_clusters = [int(c) for c in cluster_labels]
+
+     db.index_data(embeddings, payloads, target_clusters)
+
+     # Verify Count
+     print("Verifying index count...")
+     info = db.client.get_collection(COLLECTION_NAME)
+     print(f"Collection '{COLLECTION_NAME}' has {info.points_count} points.")

    print(">>> Ingestion Complete!")

if __name__ == "__main__":
    ingest_data()
+
scripts/train_routers_only.py ADDED
@@ -0,0 +1,49 @@
+ import sys
+ import os
+
+ # Add project root to path
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ from config import (
+     NUM_CLUSTERS, MRL_DIMS, EMBEDDING_MODELS
+ )
+ from src.data_pipeline import get_embeddings, load_ms_marco
+ from src.router import LearnedRouter
+
+ def train_routers():
+     print(">>> Starting Router Training Pipeline...")
+
+     # 1. Load Data
+     N_SAMPLES = 25000
+     print(f"Loading {N_SAMPLES} samples from MS MARCO...")
+     raw_texts = load_ms_marco(N_SAMPLES)
+
+     # 2. Generate Embeddings (MiniLM-L6-v2)
+     MODEL_NAME = EMBEDDING_MODELS["minilm"]
+     print(f"Generating embeddings using {MODEL_NAME}...")
+     embeddings = get_embeddings(MODEL_NAME, raw_texts)
+
+     # 3. Train & Save Routers
+     models_to_train = [
+         ("logistic", "router_logistic.pkl"),
+         ("lightgbm", "router_lightgbm.pkl"),
+         ("mlp", "router_mlp.pkl")
+     ]
+
+     for model_type, filename in models_to_train:
+         print(f"\n--- Training {model_type.upper()} Router ---")
+         router = LearnedRouter(model_type=model_type, n_clusters=NUM_CLUSTERS, mrl_dims=MRL_DIMS)
+         router.train(embeddings)
+
+         save_path = os.path.abspath(f"models/{filename}")
+         print(f"Saving to {save_path}...")
+         os.makedirs(os.path.dirname(save_path), exist_ok=True)
+         router.save(save_path)
+         print(f"{model_type.upper()} Router saved.")
+
+     print("\n>>> All Routers Trained & Saved!")
+
+ if __name__ == "__main__":
+     train_routers()
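Editor's note: for reference, consuming one of these artifacts follows the interface used in `scripts/diagnostic.py` (`LearnedRouter.load`, `router.n_clusters`, `router.kmeans`). The MiniLM model id below is the conventional name for MiniLM-L6-v2 and is an assumption, since `EMBEDDING_MODELS` itself is not shown in this commit:

```python
from src.router import LearnedRouter
from src.data_pipeline import get_embeddings

# Load an artifact produced by this script (interface as in scripts/diagnostic.py).
router = LearnedRouter.load("models/router_logistic.pkl")
print(f"Clusters: {router.n_clusters}")

# Illustrative: embed a query and see which physical shard K-Means assigns it to.
vec = get_embeddings("sentence-transformers/all-MiniLM-L6-v2", ["what is qdrant?"])
print(router.kmeans.predict(vec))
```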
src/vector_db.py CHANGED
@@ -104,10 +104,13 @@ class UnifiedQdrant:

    def index_data(self, vectors: np.ndarray, payloads: List[Dict[str, Any]], cluster_ids: List[Optional[int]] = None):
        """
-         Indexes data.
+         Indexes data with batching to avoid payload limits.
        If cluster_ids provided, uses custom sharding (Prod).
        If cluster_ids is None, uses standard upsert (Baseline/Local).
+         BATCH_SIZE is hardcoded to 500 for safety.
        """
+         BATCH_SIZE = 500
+
        if cluster_ids is None or self.is_local:
            # Standard Upsert
            points = [
@@ -117,11 +120,16 @@ class UnifiedQdrant:
                    payload=payloads[i]
                ) for i, vec in enumerate(vectors)
            ]
-             # Batching is better, but for simplicity:
-             self.client.upsert(
-                 collection_name=self.collection_name,
-                 points=points
-             )
+
+             # Batching
+             total = len(points)
+             print(f"Upserting {total} points to '{self.collection_name}' (Standard)...")
+             for i in range(0, total, BATCH_SIZE):
+                 batch = points[i : i + BATCH_SIZE]
+                 self.client.upsert(
+                     collection_name=self.collection_name,
+                     points=batch
+                 )
            return

        # Custom Sharding Upsert
@@ -141,13 +149,19 @@ class UnifiedQdrant:
            )
        )

-         print(f"Indexing data across {len(data_by_shard)} shards...")
-         for key, batch_points in data_by_shard.items():
-             self.client.upsert(
-                 collection_name=self.collection_name,
-                 points=batch_points,
-                 shard_key_selector=key
-             )
+         print(f"Indexing data across {len(data_by_shard)} shards (Custom Sharded)...")
+         for key, shard_points in data_by_shard.items():
+             # Batch per shard too. At 25k samples over 32 shards (~800 points of
+             # roughly 8 KB each) a shard stays far below the 32 MB payload limit,
+             # but batching keeps this safe for larger runs.
+             total_shard = len(shard_points)
+             for i in range(0, total_shard, BATCH_SIZE):
+                 batch = shard_points[i : i + BATCH_SIZE]
+                 self.client.upsert(
+                     collection_name=self.collection_name,
+                     points=batch,
+                     shard_key_selector=key
+                 )

    def search_hybrid(self, query_vec: np.ndarray, target_clusters: List[int], confidence: float) -> List[Any]:
        """