constitutional-ai-bot / embed_data.py
Prarabdha pandey
Initial commit with clean project files
de526bd
import time
from urllib.parse import quote_plus
from pymongo import MongoClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import certifi
# --- 1. MongoDB Connection ---
USERNAME = "prarabdhapandey696_db_user"
PASSWORD = "Golu123Golu"
CLUSTER_URL = "constitutionbotcluster.d4pdxfq.mongodb.net"
encoded_password = quote_plus(PASSWORD)
connection_string = f"mongodb+srv://{USERNAME}:{encoded_password}@{CLUSTER_URL}/?retryWrites=true&w=majority"
ca = certifi.where()
client = MongoClient(connection_string, tlsCAFile=ca)
db = client.constitution_db
source_collection = db.articles
dest_collection = db.vectors
print("✅ Connected to MongoDB!")
# --- 2. Initialize Models ---
print("🧠 Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded.")
# --- 3. Chunking and Embedding Pipeline ---
try:
dest_collection.delete_many({})
print("Cleared existing data from destination collection.")
articles = list(source_collection.find())
print(f"Found {len(articles)} articles to process.")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
for article in articles:
# CORRECTED PART 1: Check for 'article_desc' instead of 'Text'
if 'article_desc' in article and article['article_desc']:
# CORRECTED PART 2: Get the text from the 'article_desc' field
chunks = text_splitter.split_text(article['article_desc'])
embeddings = embedding_model.encode(chunks)
docs_to_insert = []
for i, chunk in enumerate(chunks):
new_doc = {
# CORRECTED PART 3: Use 'article_id' as the source identifier
"source_title": f"Article {article.get('article_id')}",
"text_chunk": chunk,
"embedding": embeddings[i].tolist()
}
docs_to_insert.append(new_doc)
if docs_to_insert:
dest_collection.insert_many(docs_to_insert)
print("\n🎉 Successfully chunked and embedded all documents!")
except Exception as e:
print(f"An error occurred: {e}")
finally:
client.close()
print("Connection closed.")