# Page residue from the web file viewer this script was copied from:
# Spaces: Sleeping | File size: 2,320 Bytes | de526bd
import os
import time
from urllib.parse import quote_plus

import certifi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
# --- 1. MongoDB Connection ---
# The password is read from the environment so that no secret lives in source
# control. The password that used to be hard-coded here should be treated as
# compromised and rotated. Username and cluster host keep their previous values
# as defaults and can be overridden through the environment as well.
USERNAME = os.environ.get("MONGODB_USERNAME", "prarabdhapandey696_db_user")
PASSWORD = os.environ.get("MONGODB_PASSWORD")
CLUSTER_URL = os.environ.get(
    "MONGODB_CLUSTER_URL", "constitutionbotcluster.d4pdxfq.mongodb.net"
)
if not PASSWORD:
    raise RuntimeError(
        "MONGODB_PASSWORD is not set; export it before running this script."
    )

# Both userinfo components are percent-encoded so that reserved characters
# (':', '@', '/', ...) cannot corrupt the connection URI.
encoded_username = quote_plus(USERNAME)
encoded_password = quote_plus(PASSWORD)
connection_string = (
    f"mongodb+srv://{encoded_username}:{encoded_password}@{CLUSTER_URL}"
    "/?retryWrites=true&w=majority"
)

# Verify the server's TLS certificate against certifi's CA bundle.
ca = certifi.where()
client = MongoClient(connection_string, tlsCAFile=ca)
db = client.constitution_db
source_collection = db.articles  # articles to be chunked and embedded
dest_collection = db.vectors     # receives one document per embedded chunk
# NOTE: no round-trip to the server is made in this section, so this message
# confirms that the client object was created, not that the server is reachable.
print("✅ Connected to MongoDB!")
# --- 2. Initialize Models ---
# Name of the sentence-embedding model loaded for this run.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

print("🧠 Loading embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("✅ Model loaded.")
# --- 3. Chunking and Embedding Pipeline ---
# Rebuilds the destination collection from scratch: every source article with
# a non-empty 'article_desc' is split into overlapping chunks, each chunk is
# embedded, and one document per chunk is written to the destination.
try:
    # NOTE: the destination is cleared before any article is processed, so a
    # failure part-way through leaves it only partially repopulated.
    dest_collection.delete_many({})
    print("Cleared existing data from destination collection.")

    articles = list(source_collection.find())
    print(f"Found {len(articles)} articles to process.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    for article in articles:
        # The article body lives in 'article_desc'; skip articles where the
        # field is missing or empty.
        article_text = article.get('article_desc')
        if not article_text:
            continue

        chunks = text_splitter.split_text(article_text)
        if not chunks:
            # Nothing to embed, and there would be nothing to insert either.
            continue

        embeddings = embedding_model.encode(chunks)

        # 'article_id' is used as the source identifier shared by all chunks
        # of the same article.
        source_title = f"Article {article.get('article_id')}"
        docs_to_insert = [
            {
                "source_title": source_title,
                "text_chunk": chunk,
                # Convert the embedding to a plain list before storing it.
                "embedding": embedding.tolist(),
            }
            for chunk, embedding in zip(chunks, embeddings)
        ]
        dest_collection.insert_many(docs_to_insert)

    print("\n🎉 Successfully chunked and embedded all documents!")
except Exception as e:
    # Top-level boundary of the script: the failure is reported but not
    # re-raised, so the script continues to the cleanup below.
    print(f"An error occurred: {e}")
finally:
    # Always release the MongoDB connection, on success and on failure alike.
    client.close()
    print("Connection closed.")