#!/usr/bin/env python3
"""
Regenerate embeddings using the semantic sentence-transformers model.

Purpose:
    Completely regenerates embeddings using the semantic
    sentence-transformers model, creates a new Pinecone index, and uploads
    the embeddings. This is a full refresh of the vector database with
    semantic embeddings.

Process:
    1. Loads documents and chunks them
    2. Generates semantic embeddings (384-dim using all-MiniLM-L6-v2)
    3. Saves to data/chunks_semantic.jsonl
    4. Creates a new Pinecone index with 384 dimensions
    5. Uploads semantic embeddings to the new index

Inputs:
    None (uses the sample_docs directory by default)
    PINECONE_API_KEY environment variable

Outputs:
    Saves embedded chunks to data/chunks_semantic.jsonl
    Creates and populates a new Pinecone index
    Prints progress and completion messages

Environment variables required:
    PINECONE_API_KEY: Your Pinecone API key

Usage:
    python scripts/regenerate_with_semantic.py
"""
import json
import os
import sys
import time
from pathlib import Path

# Make the project root importable before pulling in src.* modules.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from pinecone import Pinecone, ServerlessSpec

import src.config as cfg
from src.ingestion.chunker import chunk_documents
from src.ingestion.embeddings import batch_embed_chunks, get_embedding
from src.ingestion.load_docs import load_markdown_docs
def _load_and_chunk():
    """Load the sample markdown docs and split them into overlapping chunks.

    Returns:
        list[dict]: chunk records as produced by ``chunk_documents``
        (each carries at least a ``"text"`` key — see ``_save_jsonl``).
    """
    print("\n[1/5] Loading documents...")
    docs_dir = str(PROJECT_ROOT / "sample_docs")
    docs = load_markdown_docs(docs_dir)
    print(f" Loaded {len(docs)} documents")

    print("\n[2/5] Chunking documents...")
    chunks = chunk_documents(docs, max_tokens=300, overlap=50)
    print(f" Generated {len(chunks)} chunks")
    return chunks


def _embed_chunks(chunks):
    """Generate semantic embeddings for every chunk via sentence-transformers.

    Returns the list produced by ``batch_embed_chunks``; each record carries
    ``filename``, ``chunk_id`` and an ``embedding`` vector (384-dim for
    all-MiniLM-L6-v2).
    """
    print("\n[3/5] Generating semantic embeddings...")
    print(" Using model: all-MiniLM-L6-v2 (384 dimensions)")
    print(" This may take 1-2 minutes...")
    return batch_embed_chunks(
        chunks,
        provider="sentence-transformers",
        model_name="all-MiniLM-L6-v2",
    )


def _save_jsonl(embedded, chunks, output_file):
    """Write embedded chunks (with their source text) to a JSONL file.

    ``embedded`` and ``chunks`` are parallel lists: the embedding output does
    not carry the chunk text, so it is merged back in from ``chunks``.
    """
    print("\n[4/5] Saving embeddings...")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w", encoding="utf-8") as f:
        # zip the two parallel lists instead of indexing chunks[i] by hand.
        for chunk, record in zip(chunks, embedded):
            obj = {
                "id": f"{record['filename']}::{record['chunk_id']}",
                "filename": record["filename"],
                "chunk_id": record["chunk_id"],
                "text": chunk["text"],
                "chars": record.get("chars", 0),
                "embedding": record["embedding"],
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
    print(f" β Saved to: {output_file}")


def _recreate_index(pc, name, dim):
    """Drop any existing index of this name, create a fresh serverless one,
    and block until Pinecone reports it ready."""
    print(f" Creating new index: {name}")
    print(f" Dimension: {dim}, Metric: cosine")
    existing_indexes = [idx.name for idx in pc.list_indexes()]
    if name in existing_indexes:
        print(f" Index '{name}' already exists - deleting old version...")
        pc.delete_index(name)
    pc.create_index(
        name=name,
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(" β Index created")
    # create_index returns before the index is queryable; poll until ready.
    print(" Waiting for index to be ready...")
    while not pc.describe_index(name).status.ready:
        time.sleep(1)


def _upload_vectors(index, embedded):
    """Upsert all embeddings into the index in batches of 100, then verify."""
    print(f"\n Uploading {len(embedded)} vectors to Pinecone...")
    # NOTE(review): metadata is deliberately left empty; chunk text is only
    # persisted in the JSONL file. Confirm the retriever does not expect
    # text/filename metadata on the Pinecone side.
    vectors = [
        {
            "id": f"{e['filename']}::{e['chunk_id']}",
            "values": e["embedding"],
            "metadata": {},
        }
        for e in embedded
    ]
    batch_size = 100  # Pinecone recommends upserting in small batches.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
        print(f" Uploaded {min(start + batch_size, len(vectors))}/{len(vectors)} vectors")
    stats = index.describe_index_stats()
    print(f" β Index now contains {stats.total_vector_count} vectors")


def main():
    """Regenerate semantic embeddings and rebuild the Pinecone index.

    Pipeline: load markdown docs -> chunk -> embed with all-MiniLM-L6-v2 ->
    save to data/chunks_semantic.jsonl -> recreate the "rag-semantic-384"
    Pinecone index -> upsert every vector.

    Requires PINECONE_API_KEY (read via src.config).
    """
    print("=" * 60)
    print("Regenerating Embeddings with Semantic Model")
    print("=" * 60)

    chunks = _load_and_chunk()
    embedded = _embed_chunks(chunks)

    # Take the dimension from the data itself so the index always matches
    # what the model actually produced (384 for all-MiniLM-L6-v2).
    actual_dim = len(embedded[0]["embedding"])
    print(f" β Generated {len(embedded)} embeddings ({actual_dim} dimensions)")

    output_file = PROJECT_ROOT / "data" / "chunks_semantic.jsonl"
    _save_jsonl(embedded, chunks, output_file)

    print("\n[5/5] Setting up Pinecone index...")
    print(" Connecting to Pinecone...")
    pc = Pinecone(api_key=cfg.PINECONE_API_KEY)
    new_index_name = "rag-semantic-384"
    _recreate_index(pc, new_index_name, actual_dim)

    index = pc.Index(new_index_name)
    _upload_vectors(index, embedded)

    print("\n" + "=" * 60)
    print("β COMPLETE!")
    print("=" * 60)
    print("\nNext steps:")
    print(f"1. Update config: export PINECONE_INDEX_NAME='{new_index_name}'")
    print("2. Test search: python -c \"from src.retrieval.retriever import query_pinecone; print(query_pinecone('what is GDPR', top_k=5))\"")
    print()
| if __name__ == "__main__": | |
| main() |