Spaces:
Sleeping
Sleeping
| import os | |
| import pymongo | |
| import pickle | |
| from dotenv import load_dotenv | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.schema import Document | |
| from datetime import datetime | |
# -----------------------------
# 1. Load environment variables
# -----------------------------
load_dotenv()

# Fail fast when the connection string is absent: pymongo.MongoClient(None)
# would otherwise silently fall back to localhost and hide misconfiguration.
MONGO_URI = os.getenv("MONGO_URI")  # Mongo connection
if not MONGO_URI:
    raise RuntimeError("MONGO_URI environment variable is not set")

DB_NAME = os.getenv("DB_NAME", "company_chatbot")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "processed_data")

# Output folder for the FAISS index, resolved relative to the working dir.
VECTORSTORE_DIR = "../vectorstore"
os.makedirs(VECTORSTORE_DIR, exist_ok=True)
# -----------------------------
# 2. Connect to MongoDB
# -----------------------------
# NOTE: MongoClient is lazy — no network round-trip happens here; a bad URI
# only surfaces at the first real operation (the collection.find() below).
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]
# -----------------------------
# 3. Preprocessing helpers
# -----------------------------
def clean_string(val):
    """Coerce *val* to text with all quote characters removed.

    None maps to the empty string; anything else is str()-ified, has every
    single and double quote deleted, and is stripped of surrounding
    whitespace.
    """
    if val is None:
        return ""
    # One C-level pass deletes both quote characters (3rd maketrans arg).
    return str(val).translate(str.maketrans("", "", "\"'")).strip()
def clean_numeric(val):
    """Coerce *val* to float, returning None when conversion is impossible.

    Catches only TypeError (None, lists, dicts, ...) and ValueError
    (non-numeric strings). The original bare ``except:`` also swallowed
    SystemExit/KeyboardInterrupt and could mask genuine bugs.
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        return None
def preprocess_document(doc):
    """Convert a raw MongoDB document into a LangChain Document.

    page_content is a whitespace-collapsed textual summary fed to the
    embedding model; every structured field is also mirrored into metadata
    so retrieved chunks can be filtered after search.
    """
    # --- normalise scalar fields via the shared helpers ---------------
    name = clean_string(doc.get("projectName"))
    ptype = clean_string(doc.get("projectType"))
    category = clean_string(doc.get("projectCategory"))
    slug = clean_string(doc.get("slug"))
    status = clean_string(doc.get("status"))
    bhk = clean_string(doc.get("type") or doc.get("customBHK"))
    price = clean_numeric(doc.get("price"))
    area = clean_numeric(doc.get("carpetArea"))
    baths = clean_numeric(doc.get("bathrooms"))
    balconies = clean_numeric(doc.get("balcony"))
    furnishing = clean_string(doc.get("furnishedType"))
    has_lift = doc.get("lift", False)
    possession = clean_string(doc.get("possessionDate"))
    about = clean_string(doc.get("aboutProperty"))
    address = clean_string(doc.get("Address info"))

    # --- derive locality/city from the trailing slug segments ---------
    # assumes slugs end "...-<locality>-<city>-<something>" — TODO confirm
    segments = slug.split("-") if slug else []
    locality = segments[-3].capitalize() if len(segments) >= 3 else ""
    city = segments[-2].capitalize() if len(segments) >= 2 else ""

    # Free-text body for embeddings (any text the LLM should see).
    raw_text = f"""
Project Name: {name}
Type: {bhk}
Status: {status}
Price: {price}
Carpet Area: {area}
Bathrooms: {baths}
Balcony: {balconies}
Furnishing: {furnishing}
Lift: {has_lift}
Location: {locality}, {city}
Address: {address}
Amenities: {about}
"""

    # Structured metadata carried alongside each chunk.
    meta = {
        "id": str(doc.get("_id")),
        "slug": slug,
        "projectName": name,
        "projectType": ptype,
        "projectCategory": category,
        "status": status,
        "BHK": bhk,
        "price": price,
        # Rupees → crores (1 cr = 1e7); falsy price (None or 0) stays None.
        "price_in_cr": round(price / 10000000, 2) if price else None,
        "carpetArea": area,
        "bathrooms": baths,
        "balcony": balconies,
        "furnishedType": furnishing,
        "lift": has_lift,
        "possessionDate": possession,
        "city": city,
        "locality": locality,
        "address": address,
        "amenities": about,
        "createdAt": doc.get("createdAt"),
        "updatedAt": doc.get("updatedAt"),
    }

    # Collapse every whitespace run to a single space before embedding.
    return Document(page_content=" ".join(raw_text.split()), metadata=meta)
# -----------------------------
# 4. Fetch & preprocess all docs
# -----------------------------
# Materialises the entire collection in memory; fine for modest corpora,
# but consider batching if the collection grows large.
raw_docs = list(collection.find({}))
documents = [preprocess_document(doc) for doc in raw_docs]
print(f"Fetched {len(documents)} documents from MongoDB")
# -----------------------------
# 5. Chunk documents
# -----------------------------
# 512-char chunks with 50-char overlap; each chunk inherits its parent
# Document's metadata, so structured filtering still works post-split.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
docs = text_splitter.split_documents(documents)
print(f"After chunking → {len(docs)} chunks")
# -----------------------------
# 6. Generate embeddings
# -----------------------------
# all-MiniLM-L6-v2: compact 384-dim sentence-transformer; the model weights
# are downloaded from the Hugging Face Hub on first run.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)
# -----------------------------
# 7. Save FAISS index & metadata
# -----------------------------
# save_local writes both the FAISS index and the docstore/metadata pickle
# into VECTORSTORE_DIR as one unit (load back with FAISS.load_local).
vectorstore.save_local(VECTORSTORE_DIR)
print(f"✅ FAISS vectors and metadata saved in {VECTORSTORE_DIR}")