# RAG-document-assistant / scripts/regenerate_with_semantic.py
# (Hugging Face Space file header — committer vn6295337, commit f866820:
#  "Initial commit: RAG Document Assistant with Zero-Storage Privacy")
#!/usr/bin/env python3
"""
Regenerate embeddings using semantic sentence-transformers model.
Purpose:
Completely regenerates embeddings using the semantic sentence-transformers model,
creates a new Pinecone index, and uploads the embeddings. This is a full refresh
of the vector database with semantic embeddings.
Process:
1. Loads documents and chunks them
2. Generates semantic embeddings (384-dim using all-MiniLM-L6-v2)
3. Saves to data/chunks_semantic.jsonl
4. Creates new Pinecone index with 384 dimensions
5. Uploads semantic embeddings to new index
Inputs:
None (uses sample_docs directory by default)
PINECONE_API_KEY environment variable
Outputs:
Saves embedded chunks to data/chunks_semantic.jsonl
Creates and populates new Pinecone index
Prints progress and completion messages
Environment variables required:
PINECONE_API_KEY: Your Pinecone API key
Usage:
python scripts/regenerate_with_semantic.py
"""
import json
import os
import sys
import time
from pathlib import Path

# Add project root to path so the `src` package resolves when run as a script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from pinecone import Pinecone, ServerlessSpec

import src.config as cfg
from src.ingestion.chunker import chunk_documents
from src.ingestion.embeddings import batch_embed_chunks, get_embedding
from src.ingestion.load_docs import load_markdown_docs
def _load_and_chunk():
    """Steps 1-2: load markdown docs from sample_docs and split into chunks.

    Returns:
        list[dict]: chunk records (each with at least a "text" key, per the
        usage below).
    """
    print("\n[1/5] Loading documents...")
    docs_dir = str(PROJECT_ROOT / "sample_docs")
    docs = load_markdown_docs(docs_dir)
    print(f" Loaded {len(docs)} documents")
    print("\n[2/5] Chunking documents...")
    chunks = chunk_documents(docs, max_tokens=300, overlap=50)
    print(f" Generated {len(chunks)} chunks")
    return chunks


def _embed(chunks):
    """Step 3: generate semantic embeddings for *chunks*.

    Returns:
        tuple[list[dict], int]: the embedded records and the actual embedding
        dimension measured from the first result.

    Raises:
        RuntimeError: if the embedder produced no output (e.g. empty corpus),
        instead of an opaque IndexError further down.
    """
    print("\n[3/5] Generating semantic embeddings...")
    print(" Using model: all-MiniLM-L6-v2 (384 dimensions)")
    print(" This may take 1-2 minutes...")
    embedded = batch_embed_chunks(
        chunks,
        provider="sentence-transformers",
        model_name="all-MiniLM-L6-v2",
    )
    if not embedded:
        raise RuntimeError(
            "No embeddings were generated - check the sample_docs directory"
        )
    # Measure the real dimension from the output rather than hard-coding 384,
    # so a model swap propagates to the index creation below.
    actual_dim = len(embedded[0]["embedding"])
    print(f" βœ“ Generated {len(embedded)} embeddings ({actual_dim} dimensions)")
    return embedded, actual_dim


def _save_embeddings(chunks, embedded):
    """Step 4: write one JSON object per chunk to data/chunks_semantic.jsonl."""
    print("\n[4/5] Saving embeddings...")
    output_file = PROJECT_ROOT / "data" / "chunks_semantic.jsonl"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w", encoding="utf-8") as f:
        # Pair each embedding with its source chunk to recover the text.
        # Assumes batch_embed_chunks preserves input order — the original
        # code relied on the same positional pairing (chunks[i]).
        for chunk, e in zip(chunks, embedded):
            obj = {
                "id": f"{e['filename']}::{e['chunk_id']}",
                "filename": e["filename"],
                "chunk_id": e["chunk_id"],
                "text": chunk["text"],
                "chars": e.get("chars", 0),
                "embedding": e["embedding"],
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
    print(f" βœ“ Saved to: {output_file}")


def _recreate_index(dimension):
    """Step 5a: drop any stale index and create a fresh serverless one.

    Returns:
        tuple[Pinecone, str]: the client and the new index name, once the
        index reports ready.
    """
    print("\n[5/5] Setting up Pinecone index...")
    print(" Connecting to Pinecone...")
    pc = Pinecone(api_key=cfg.PINECONE_API_KEY)
    index_name = "rag-semantic-384"
    print(f" Creating new index: {index_name}")
    print(f" Dimension: {dimension}, Metric: cosine")
    existing = [idx.name for idx in pc.list_indexes()]
    if index_name in existing:
        print(f" Index '{index_name}' already exists - deleting old version...")
        pc.delete_index(index_name)
        # delete_index is asynchronous; wait until the name disappears so
        # create_index below cannot collide with the half-deleted index.
        while index_name in [idx.name for idx in pc.list_indexes()]:
            time.sleep(1)
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(" βœ“ Index created")
    print(" Waiting for index to be ready...")
    while not pc.describe_index(index_name).status.ready:
        time.sleep(1)
    return pc, index_name


def _upload(pc, index_name, embedded):
    """Step 5b: upsert all vectors in batches of 100 and report index stats."""
    print(f"\n Uploading {len(embedded)} vectors to Pinecone...")
    index = pc.Index(index_name)
    # NOTE(review): metadata is deliberately left empty — presumably the
    # project's "zero-storage privacy" design keeps chunk text only in the
    # local JSONL file. Confirm before adding text metadata here.
    vectors = [
        {
            "id": f"{e['filename']}::{e['chunk_id']}",
            "values": e["embedding"],
            "metadata": {},
        }
        for e in embedded
    ]
    batch_size = 100
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
        done = min(start + batch_size, len(vectors))
        print(f" Uploaded {done}/{len(vectors)} vectors")
    stats = index.describe_index_stats()
    print(f" βœ“ Index now contains {stats.total_vector_count} vectors")


def _print_next_steps(index_name):
    """Print the follow-up commands for switching the app to the new index."""
    print("\n" + "=" * 60)
    print("βœ… COMPLETE!")
    print("=" * 60)
    print("\nNext steps:")
    print(f"1. Update config: export PINECONE_INDEX_NAME='{index_name}'")
    print(f"2. Test search: python -c \"from src.retrieval.retriever import query_pinecone; print(query_pinecone('what is GDPR', top_k=5))\"")
    print()


def main():
    """Regenerate semantic embeddings and rebuild the Pinecone index.

    Pipeline: load docs -> chunk -> embed (all-MiniLM-L6-v2) -> save JSONL
    -> recreate the "rag-semantic-384" index -> upsert vectors.

    Requires PINECONE_API_KEY (read via src.config). Any failure in loading,
    embedding, or the Pinecone client propagates so the script fails loudly.
    """
    print("=" * 60)
    print("Regenerating Embeddings with Semantic Model")
    print("=" * 60)
    chunks = _load_and_chunk()
    embedded, dimension = _embed(chunks)
    _save_embeddings(chunks, embedded)
    pc, index_name = _recreate_index(dimension)
    _upload(pc, index_name, embedded)
    _print_next_steps(index_name)
# Run the full regeneration pipeline when executed as a script; importing
# this module performs no work beyond the sys.path bootstrap above.
if __name__ == "__main__":
    main()