Improve validation and resource handling
scripts/kaggle_pipeline.py  (CHANGED, +47 -40)
@@ -30,7 +30,8 @@ if IS_KAGGLE:
 
     import subprocess
 
+    # Pin exact versions matching requirements.txt for reproducibility
+    packages = ["qdrant-client==1.12.1", "sentence-transformers==3.3.1"]
     for pkg in packages:
         subprocess.check_call(
             [sys.executable, "-m", "pip", "install", "-q", pkg],
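Note on the pins: a quick post-install check can confirm the pinned versions are the ones actually active in the kernel. Sketch only, not part of this change, reusing the two pins from the hunk above:

from importlib.metadata import version

for name, pinned in [("qdrant-client", "1.12.1"), ("sentence-transformers", "3.3.1")]:
    installed = version(name)  # distribution name, not import name
    if installed != pinned:
        raise RuntimeError(f"{name}: expected {pinned}, got {installed}")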
@@ -50,7 +51,7 @@ else:
     load_dotenv()
     print("Using local .env")
 
-print(f"QDRANT_URL: {os.environ.get('QDRANT_URL'
+print(f"QDRANT_URL: {'configured' if os.environ.get('QDRANT_URL') else 'NOT SET'}")
 
 # %% [markdown]
 # ## Check GPU
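Note on the env check: printing 'configured' / 'NOT SET' avoids echoing the Qdrant URL (and anything embedded in it) into the Kaggle log. If more variables need the same treatment, a small loop keeps it uniform. Sketch only; QDRANT_API_KEY here is purely illustrative and not from this change:

import os

for var in ("QDRANT_URL", "QDRANT_API_KEY"):  # QDRANT_API_KEY is illustrative
    print(f"{var}: {'configured' if os.environ.get(var) else 'NOT SET'}")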
@@ -75,6 +76,7 @@ SUBSET_SIZE = 1_000_000 if IS_KAGGLE else 100_000
 
 print(f"Loading {SUBSET_SIZE:,} reviews...")
 start = time.time()
+# Kaggle kernels are ephemeral - no persistent cache between runs, always regenerate
 df = prepare_data(subset_size=SUBSET_SIZE, force=True)
 print(f"Prepared {len(df):,} reviews in {time.time() - start:.1f}s")
 
@@ -111,6 +113,8 @@ print(f"Expansion ratio: {len(chunks) / len(reviews):.2f}x")
 # %%
 import numpy as np
 
+from sage.config import EMBEDDING_DIM
+
 chunk_texts = [c.text for c in chunks]
 
 cache_dir = Path("/kaggle/working") if IS_KAGGLE else Path("data")
@@ -130,11 +134,16 @@ embed_time = time.time() - start
 print(f"Embeddings: {embeddings.shape} in {embed_time:.1f}s")
 print(f"Throughput: {len(chunks) / embed_time:.0f} chunks/sec")
 
-# Validate
+# Validate embeddings (explicit checks instead of assert - survives python -O)
+if embeddings.shape[1] != EMBEDDING_DIM:
+    raise ValueError(
+        f"Wrong embedding dimensions: {embeddings.shape[1]}, expected {EMBEDDING_DIM}"
+    )
+if np.isnan(embeddings).any() or np.isinf(embeddings).any():
+    raise ValueError("Embeddings contain NaN or Inf values")
 norms = np.linalg.norm(embeddings, axis=1)
+if not np.allclose(norms, 1.0, atol=0.01):
+    raise ValueError("Embeddings are not normalized")
 print("Validation: PASSED")
 
 # %% [markdown]
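Note on the validation block: the same three checks (dimension, NaN/Inf, unit norms) could be folded into one helper if another script ever needs them. A rough sketch, where validate_embeddings and the dummy data are illustrative only, not part of this change:

import numpy as np

def validate_embeddings(emb: np.ndarray, expected_dim: int, atol: float = 0.01) -> None:
    # Explicit raises rather than assert, so the checks survive python -O
    if emb.ndim != 2 or emb.shape[1] != expected_dim:
        raise ValueError(f"Wrong embedding shape: {emb.shape}, expected (*, {expected_dim})")
    if np.isnan(emb).any() or np.isinf(emb).any():
        raise ValueError("Embeddings contain NaN or Inf values")
    norms = np.linalg.norm(emb, axis=1)
    if not np.allclose(norms, 1.0, atol=atol):
        raise ValueError("Embeddings are not normalized")

# Example: three unit vectors of dimension 4 pass all checks
validate_embeddings(np.eye(3, 4), expected_dim=4)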
@@ -147,40 +156,38 @@ from sage.adapters.vector_store import (
     upload_chunks,
     get_collection_info,
     create_payload_indexes,
+    search,
 )
 
-qdrant_url = os.environ.get("QDRANT_URL")
-print(f"Uploading to: {qdrant_url[:40]}...")
-
 client = get_client()
+try:
+    create_collection(client)
+    create_payload_indexes(client)
+
+    start = time.time()
+    upload_chunks(client, chunks, embeddings)
+    print(f"Upload complete in {time.time() - start:.1f}s")
+
+    info = get_collection_info(client)
+    print("\nCollection info:")
+    for key, value in info.items():
+        print(f"  {key}: {value}")
+
+# %% [markdown]
+# ## Test Search
+
+# %%
+    query = "wireless headphones with noise cancellation"
+    query_emb = embedder.embed_single_query(query)
+    results = search(client, query_emb.tolist(), limit=5)
+
+    print(f"Query: '{query}'\n")
+    for i, r in enumerate(results):
+        print(f"{i + 1}. [{r['rating']:.0f}*] {r['text'][:70]}...")
+
+# %%
+    print(
+        f"\nDone! {info.get('points_count', len(chunks)):,} chunks indexed to Qdrant Cloud"
+    )
+finally:
+    client.close()
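Note on the resource handling: the try/finally guarantees client.close() runs whether the upload, the collection-info dump, or the test search raises. If a context manager reads better, contextlib.closing gives the same guarantee. A minimal sketch, assuming only what the hunk already shows (get_client() returns an object with a close() method):

from contextlib import closing

with closing(get_client()) as client:  # close() runs on success and on error
    create_collection(client)
    create_payload_indexes(client)
    upload_chunks(client, chunks, embeddings)
    print(get_collection_info(client))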