Fix README accuracy and track Kaggle notebook
Browse files- .gitignore +1 -0
- README.md +1 -1
- scripts/kaggle_pipeline.ipynb +1 -0
.gitignore
CHANGED
|
@@ -31,6 +31,7 @@ assets/
|
|
| 31 |
# Jupyter
|
| 32 |
.ipynb_checkpoints/
|
| 33 |
*.ipynb
|
|
|
|
| 34 |
|
| 35 |
# Build
|
| 36 |
*.egg-info/
|
|
|
|
| 31 |
# Jupyter
|
| 32 |
.ipynb_checkpoints/
|
| 33 |
*.ipynb
|
| 34 |
+
!scripts/kaggle_pipeline.ipynb
|
| 35 |
|
| 36 |
# Build
|
| 37 |
*.egg-info/
|
README.md
CHANGED
|
@@ -76,7 +76,7 @@ User Query: "wireless earbuds for running"
|
|
| 76 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 77 |
```
|
| 78 |
|
| 79 |
-
**Data flow:** 1M Amazon reviews → 5-core filter → 334K reviews → semantic chunking → 423K chunks in Qdrant. *([pipeline.py](scripts/pipeline.py))*
|
| 80 |
|
| 81 |
---
|
| 82 |
|
|
|
|
| 76 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 77 |
```
|
| 78 |
|
| 79 |
+
**Data flow:** 1M Amazon reviews → 5-core filter → 334K reviews → semantic chunking → 423K chunks in Qdrant. *([pipeline.py](scripts/pipeline.py) | [Kaggle notebook](scripts/kaggle_pipeline.ipynb))*
|
| 80 |
|
| 81 |
---
|
| 82 |
|
scripts/kaggle_pipeline.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":""},"jupytext":{"cell_metadata_filter":"-all","main_language":"python","notebook_metadata_filter":"-all"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":14772377,"sourceType":"datasetVersion","datasetId":9442603}],"dockerImageVersionId":31260,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":5,"nbformat":4,"cells":[{"id":"15d0060c","cell_type":"markdown","source":"# Sage: Kaggle GPU Pipeline\n\nRuns the full data pipeline on Kaggle with 1M reviews using GPU acceleration.\nUploads embeddings to Qdrant Cloud.\n\n**Setup:**\n1. Enable GPU (Settings -> Accelerator -> GPU T4 x2)\n2. Add secrets: `QDRANT_URL`, `QDRANT_API_KEY`\n3. Run all cells","metadata":{}},{"id":"7f7d648c","cell_type":"markdown","source":"## Environment Setup","metadata":{}},{"id":"bf7a70a5","cell_type":"code","source":"import os\nimport sys\nimport time\nfrom pathlib import Path\n\nIS_KAGGLE = \"KAGGLE_KERNEL_RUN_TYPE\" in os.environ\n\nif IS_KAGGLE:\n # Add sage package from Kaggle dataset\n sys.path.insert(0, \"/kaggle/input/sage-package\")\n\n # Override data directory (Kaggle input is read-only)\n os.environ[\"SAGE_DATA_DIR\"] = \"/kaggle/working/data\"\n\n import subprocess\n\n packages = [\"qdrant-client>=1.7.0\", \"sentence-transformers>=2.2.0\"]\n for pkg in packages:\n subprocess.check_call(\n [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", pkg],\n stdout=subprocess.DEVNULL,\n )\n print(\"Packages installed\")\n\n from kaggle_secrets import UserSecretsClient\n\n secrets = UserSecretsClient()\n os.environ[\"QDRANT_URL\"] = secrets.get_secret(\"QDRANT_URL\")\n os.environ[\"QDRANT_API_KEY\"] = secrets.get_secret(\"QDRANT_API_KEY\")\n print(\"Secrets loaded\")\nelse:\n from dotenv import load_dotenv\n\n load_dotenv()\n print(\"Using local .env\")\n\nprint(f\"QDRANT_URL: 
{os.environ.get('QDRANT_URL', 'NOT SET')[:40]}...\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-02-09T11:56:22.760900Z","iopub.execute_input":"2026-02-09T11:56:22.761180Z","iopub.status.idle":"2026-02-09T11:56:30.616328Z","shell.execute_reply.started":"2026-02-09T11:56:22.761154Z","shell.execute_reply":"2026-02-09T11:56:30.615612Z"}},"outputs":[{"name":"stdout","text":"Packages installed\nSecrets loaded\nQDRANT_URL: https://2e48f44e-d660-42d6-b0ca-00be9317...\n","output_type":"stream"}],"execution_count":1},{"id":"08a06a38","cell_type":"markdown","source":"## Check GPU","metadata":{}},{"id":"5cc7d3e1","cell_type":"code","source":"import torch\n\nif torch.cuda.is_available():\n gpu_name = torch.cuda.get_device_name(0)\n gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9\n print(f\"GPU: {gpu_name} ({gpu_mem:.1f} GB)\")\nelse:\n print(\"WARNING: No GPU detected, embeddings will be slow\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-02-09T11:56:30.617478Z","iopub.execute_input":"2026-02-09T11:56:30.617707Z","iopub.status.idle":"2026-02-09T11:56:33.982077Z","shell.execute_reply.started":"2026-02-09T11:56:30.617684Z","shell.execute_reply":"2026-02-09T11:56:33.981202Z"}},"outputs":[{"name":"stdout","text":"GPU: Tesla T4 (15.6 GB)\n","output_type":"stream"}],"execution_count":2},{"id":"54826e9b","cell_type":"markdown","source":"## Load and Filter Data","metadata":{}},{"id":"c758ee99","cell_type":"code","source":"from sage.data import prepare_data, get_review_stats\n\nSUBSET_SIZE = 1_000_000 if IS_KAGGLE else 100_000\n\nprint(f\"Loading {SUBSET_SIZE:,} reviews...\")\nstart = time.time()\ndf = prepare_data(subset_size=SUBSET_SIZE, force=True)\nprint(f\"Prepared {len(df):,} reviews in {time.time() - start:.1f}s\")\n\nstats = get_review_stats(df)\nprint(f\" Users: {stats['unique_users']:,}\")\nprint(f\" Items: {stats['unique_items']:,}\")\nprint(f\" Sparsity: 
{stats['sparsity']:.4f}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-02-09T11:56:33.983230Z","iopub.execute_input":"2026-02-09T11:56:33.983762Z","iopub.status.idle":"2026-02-09T11:57:13.115525Z","shell.execute_reply.started":"2026-02-09T11:56:33.983727Z","shell.execute_reply":"2026-02-09T11:57:13.114818Z"}},"outputs":[{"name":"stdout","text":"11:56:35 INFO NumExpr defaulting to 4 threads.\nLoading 1,000,000 reviews...\n11:56:35 INFO Preparing data from scratch...\n11:56:35 INFO Streaming from https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/review_categories/Electronics.jsonl\n","output_type":"stream"},{"name":"stderr","text":"Loading reviews: 100%|ββββββββββ| 1000000/1000000 [00:22<00:00, 43896.55it/s]","output_type":"stream"},{"name":"stdout","text":"11:56:58 INFO Loaded 1,000,000 reviews\n","output_type":"stream"},{"name":"stderr","text":"\n","output_type":"stream"},{"name":"stdout","text":"11:57:04 INFO Cached to /kaggle/working/data/reviews_1000000.parquet\n11:57:04 INFO Cleaning data quality issues...\n11:57:07 INFO Cleaned: removed 34,099 reviews (3.4%)\n11:57:07 INFO Remaining: 965,901 reviews\n11:57:07 INFO Applying 5-core filtering...\n11:57:11 INFO Final prepared dataset: 334,282 reviews\n11:57:12 INFO Cached prepared data to: /kaggle/working/data/reviews_prepared_1000000.parquet\nPrepared 334,282 reviews in 37.0s\n Users: 31,455\n Items: 21,827\n Sparsity: 0.9995\n","output_type":"stream"}],"execution_count":3},{"id":"caf3dbd9","cell_type":"markdown","source":"## Chunk Reviews","metadata":{}},{"id":"d9f77b3e","cell_type":"code","source":"from sage.adapters.embeddings import get_embedder\nfrom sage.core.chunking import chunk_reviews_batch\n\n# Prepare reviews for chunking\nreviews = df.to_dict(\"records\")\nfor i, review in enumerate(reviews):\n review[\"review_id\"] = f\"review_{i}\"\n review[\"product_id\"] = review.get(\"parent_asin\", review.get(\"asin\", \"\"))\n\nprint(\"Loading E5-small 
embedding model...\")\nembedder = get_embedder()\n\nprint(f\"Chunking {len(reviews):,} reviews...\")\nstart = time.time()\nchunks = chunk_reviews_batch(reviews, embedder=embedder)\nprint(f\"Created {len(chunks):,} chunks in {time.time() - start:.1f}s\")\nprint(f\"Expansion ratio: {len(chunks) / len(reviews):.2f}x\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-02-09T11:57:13.117017Z","iopub.execute_input":"2026-02-09T11:57:13.117264Z","iopub.status.idle":"2026-02-09T12:08:14.818571Z","shell.execute_reply.started":"2026-02-09T11:57:13.117243Z","shell.execute_reply":"2026-02-09T12:08:14.817741Z"}},"outputs":[{"name":"stdout","text":"Loading E5-small embedding model...\n","output_type":"stream"},{"name":"stderr","text":"2026-02-09 11:57:27.687967: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\nWARNING: All log messages before absl::InitializeLog() is called are written to STDERR\nE0000 00:00:1770638247.841494 55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\nE0000 00:00:1770638247.887574 55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\nW0000 00:00:1770638248.253696 55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1770638248.253731 55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1770638248.253738 55 computation_placer.cc:177] computation placer already registered. 
Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1770638248.253741 55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n","output_type":"stream"},{"name":"stdout","text":"11:57:40 INFO TensorFlow version 2.19.0 available.\n11:57:40 INFO JAX version 0.7.2 available.\n11:57:44 INFO Loading embedding model: intfloat/e5-small-v2\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"modules.json: 0%| | 0.00/387 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f80d236b6c4649de9c7bdaa881a29400"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"README.md: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"60ca7619a5654b00a73335d688990973"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"sentence_bert_config.json: 0%| | 0.00/57.0 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d0713502cbfb462c8831d91502246259"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"config.json: 0%| | 0.00/615 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"aa8c4b35bba946b6a14cde1cccd62a46"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model.safetensors: 0%| | 0.00/133M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"aff68a507afd4f7a92817629332b7a48"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json: 0%| | 0.00/314 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a3298dde565d4dfb833f16fad3d9f6d7"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"vocab.txt: 
0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"b4a995a43a5440feb8493aec37429709"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer.json: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"2f284f683f564fe28a7930ec2a0bb0c4"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"special_tokens_map.json: 0%| | 0.00/125 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a07c605024114596a1259d6209873dcd"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"config.json: 0%| | 0.00/200 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"074815346c67429a9d9e35abf29224db"}},"metadata":{}},{"name":"stdout","text":"Chunking 334,282 reviews...\nCreated 423,165 chunks in 627.0s\nExpansion ratio: 1.27x\n","output_type":"stream"}],"execution_count":4},{"id":"b044ce5f","cell_type":"markdown","source":"## Generate Embeddings (GPU)","metadata":{}},{"id":"68866603","cell_type":"code","source":"import numpy as np\n\nchunk_texts = [c.text for c in chunks]\n\ncache_dir = Path(\"/kaggle/working\") if IS_KAGGLE else Path(\"data\")\ncache_dir.mkdir(exist_ok=True)\ncache_path = cache_dir / f\"embeddings_{len(chunks)}.npy\"\n\nprint(f\"Embedding {len(chunks):,} chunks...\")\nstart = time.time()\nembeddings = embedder.embed_passages(\n chunk_texts,\n cache_path=cache_path,\n force=True,\n batch_size=64,\n)\nembed_time = time.time() - start\n\nprint(f\"Embeddings: {embeddings.shape} in {embed_time:.1f}s\")\nprint(f\"Throughput: {len(chunks) / embed_time:.0f} chunks/sec\")\n\n# Validate\nassert embeddings.shape[1] == 384, f\"Wrong dims: {embeddings.shape[1]}\"\nassert np.isnan(embeddings).sum() == 0, \"NaN values\"\nnorms = np.linalg.norm(embeddings, axis=1)\nassert np.allclose(norms, 1.0, 
atol=0.01), \"Not normalized\"\nprint(\"Validation: PASSED\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-02-09T12:08:14.819848Z","iopub.execute_input":"2026-02-09T12:08:14.820261Z","iopub.status.idle":"2026-02-09T12:20:06.482254Z","shell.execute_reply.started":"2026-02-09T12:08:14.820219Z","shell.execute_reply":"2026-02-09T12:20:06.481520Z"}},"outputs":[{"name":"stdout","text":"Embedding 423,165 chunks...\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Batches: 0%| | 0/6612 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"99209a8191974a46b73baf2e0183aee3"}},"metadata":{}},{"name":"stdout","text":"12:20:06 INFO Embeddings cached to: /kaggle/working/embeddings_423165.npy\nEmbeddings: (423165, 384) in 711.2s\nThroughput: 595 chunks/sec\nValidation: PASSED\n","output_type":"stream"}],"execution_count":5},{"id":"919a6b46","cell_type":"markdown","source":"## Upload to Qdrant Cloud","metadata":{}},{"id":"9ce8f790","cell_type":"code","source":"from sage.adapters.vector_store import (\n get_client,\n create_collection,\n upload_chunks,\n get_collection_info,\n create_payload_indexes,\n)\n\nqdrant_url = os.environ.get(\"QDRANT_URL\")\nprint(f\"Uploading to: {qdrant_url[:40]}...\")\n\nclient = get_client()\ncreate_collection(client)\n\nstart = time.time()\nupload_chunks(client, chunks, embeddings)\nprint(f\"Upload complete in {time.time() - start:.1f}s\")\n\ncreate_payload_indexes(client)\n\ninfo = get_collection_info(client)\nprint(\"\\nCollection info:\")\nfor key, value in info.items():\n print(f\" {key}: 
{value}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-02-09T12:20:06.483383Z","iopub.execute_input":"2026-02-09T12:20:06.483679Z","iopub.status.idle":"2026-02-09T12:27:38.187681Z","shell.execute_reply.started":"2026-02-09T12:20:06.483656Z","shell.execute_reply":"2026-02-09T12:27:38.186982Z"}},"outputs":[{"name":"stdout","text":"Uploading to: https://2e48f44e-d660-42d6-b0ca-00be9317...\n12:20:08 INFO Deleting existing collection: sage_reviews\n12:20:08 INFO Creating collection: sage_reviews\n","output_type":"stream"},{"name":"stderr","text":"Uploading to Qdrant: 100%|ββββββββββ| 4232/4232 [06:59<00:00, 10.08it/s]","output_type":"stream"},{"name":"stdout","text":"12:27:25 INFO Uploaded 423165 points to sage_reviews\nUpload complete in 439.0s\n12:27:27 INFO Creating payload indexes...\n","output_type":"stream"},{"name":"stderr","text":"\n","output_type":"stream"},{"name":"stdout","text":"12:27:38 INFO Indexes created for: rating, product_id, timestamp\n\nCollection info:\n name: sage_reviews\n points_count: 423165\n status: yellow\n","output_type":"stream"}],"execution_count":6},{"id":"9a419bd5","cell_type":"markdown","source":"## Test Search","metadata":{}},{"id":"67af7b69","cell_type":"code","source":"from sage.adapters.vector_store import search\n\nquery = \"wireless headphones with noise cancellation\"\nquery_emb = embedder.embed_single_query(query)\nresults = search(client, query_emb.tolist(), limit=5)\n\nprint(f\"Query: '{query}'\\n\")\nfor i, r in enumerate(results):\n print(f\"{i + 1}. [{r['rating']:.0f}*] {r['text'][:70]}...\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-02-09T12:28:22.406633Z","iopub.execute_input":"2026-02-09T12:28:22.406936Z","iopub.status.idle":"2026-02-09T12:28:22.646372Z","shell.execute_reply.started":"2026-02-09T12:28:22.406909Z","shell.execute_reply":"2026-02-09T12:28:22.645698Z"}},"outputs":[{"name":"stdout","text":"Query: 'wireless headphones with noise cancellation'\n\n1. 
[5*] These are the best noise cancellation, wireless headphones on the mark...\n2. [5*] These seem to be good wireless noise cancelling headphones. I have be...\n3. [4*] Sony Noise Cancelling Headphones WHCH710N: Wireless Bluetooth Over The...\n4. [5*] JBL T600BTNC Noise Cancelling, On-Ear, Wireless Bluetooth Headphones....\n5. [5*] Best Bluetooth headphones set with noise cancellation. Very comfortabl...\n","output_type":"stream"}],"execution_count":8},{"id":"7d2e4a89","cell_type":"code","source":"client.close()\nprint(f\"\\nDone! {info.get('points_count', len(chunks)):,} chunks indexed to Qdrant Cloud\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-02-09T12:28:25.991175Z","iopub.execute_input":"2026-02-09T12:28:25.991571Z","iopub.status.idle":"2026-02-09T12:28:25.996167Z","shell.execute_reply.started":"2026-02-09T12:28:25.991532Z","shell.execute_reply":"2026-02-09T12:28:25.995407Z"}},"outputs":[{"name":"stdout","text":"\nDone! 423,165 chunks indexed to Qdrant Cloud\n","output_type":"stream"}],"execution_count":9}]}
|