Spaces:
Sleeping
Sleeping
File size: 4,701 Bytes
8630e6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import os
import pymongo
import pickle
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from datetime import datetime
# -----------------------------
# 1. Load environment variables
# -----------------------------
load_dotenv()

# Fail fast if the connection string is absent: pymongo.MongoClient(None)
# silently falls back to localhost:27017, which hides misconfiguration.
MONGO_URI = os.getenv("MONGO_URI")  # Mongo connection
if not MONGO_URI:
    raise RuntimeError("MONGO_URI environment variable is not set")
DB_NAME = os.getenv("DB_NAME", "company_chatbot")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "processed_data")

# Output directory for the FAISS index, relative to the current working dir.
VECTORSTORE_DIR = "../vectorstore"
os.makedirs(VECTORSTORE_DIR, exist_ok=True)
# -----------------------------
# 2. Connect to MongoDB
# -----------------------------
# NOTE: MongoClient connects lazily — no server contact happens until the
# first actual operation (the collection.find() below).
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]
# -----------------------------
# 3. Preprocessing helpers
# -----------------------------
def clean_string(val):
    """Coerce *val* to a stripped string with all quote characters removed.

    ``None`` maps to the empty string; every other value is passed through
    ``str()`` first, so numbers and ObjectIds are handled too.
    """
    if val is None:
        return ""
    text = str(val)
    for quote in ('"', "'"):
        text = text.replace(quote, "")
    return text.strip()
def clean_numeric(val):
    """Coerce *val* to ``float``, returning ``None`` when conversion fails.

    Catches only ``TypeError`` (e.g. ``None``, dicts) and ``ValueError``
    (non-numeric strings) — the exceptions ``float()`` actually raises —
    instead of a bare ``except``, which also swallowed
    ``KeyboardInterrupt``/``SystemExit``.
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        return None
def preprocess_document(doc):
    """Turn one raw MongoDB record into a LangChain ``Document``.

    ``page_content`` is a whitespace-normalized textual summary intended for
    the embedding model; ``metadata`` carries the structured fields so they
    can be used for filtering/answering without re-parsing the text.
    """
    name = clean_string(doc.get("projectName"))
    kind = clean_string(doc.get("projectType"))
    category = clean_string(doc.get("projectCategory"))
    slug = clean_string(doc.get("slug"))
    status = clean_string(doc.get("status"))
    bhk = clean_string(doc.get("type") or doc.get("customBHK"))
    price = clean_numeric(doc.get("price"))
    area = clean_numeric(doc.get("carpetArea"))
    baths = clean_numeric(doc.get("bathrooms"))
    balconies = clean_numeric(doc.get("balcony"))
    furnishing = clean_string(doc.get("furnishedType"))
    has_lift = doc.get("lift", False)
    possession = clean_string(doc.get("possessionDate"))
    about = clean_string(doc.get("aboutProperty"))
    address = clean_string(doc.get("Address info"))

    # Derive locality/city from the slug's trailing segments when present
    # (assumes a "...-locality-city-<tail>" slug shape — TODO confirm).
    segments = slug.split("-") if slug else []
    locality = segments[-3].capitalize() if len(segments) >= 3 else ""
    city = segments[-2].capitalize() if len(segments) >= 2 else ""

    # Text fed to the embedding model; any field the LLM should see goes here.
    summary_fields = [
        f"Project Name: {name}",
        f"Type: {bhk}",
        f"Status: {status}",
        f"Price: {price}",
        f"Carpet Area: {area}",
        f"Bathrooms: {baths}",
        f"Balcony: {balconies}",
        f"Furnishing: {furnishing}",
        f"Lift: {has_lift}",
        f"Location: {locality}, {city}",
        f"Address: {address}",
        f"Amenities: {about}",
    ]
    # Collapse every run of whitespace to a single space — equivalent to
    # " ".join(content.split()) on a multi-line template.
    page_text = " ".join(" ".join(summary_fields).split())

    # Structured metadata stored alongside the vector.
    metadata = {
        "id": str(doc.get("_id")),
        "slug": slug,
        "projectName": name,
        "projectType": kind,
        "projectCategory": category,
        "status": status,
        "BHK": bhk,
        "price": price,
        # presumably the raw price is in rupees, so /1e7 gives crores — verify
        "price_in_cr": round(price / 10000000, 2) if price else None,
        "carpetArea": area,
        "bathrooms": baths,
        "balcony": balconies,
        "furnishedType": furnishing,
        "lift": has_lift,
        "possessionDate": possession,
        "city": city,
        "locality": locality,
        "address": address,
        "amenities": about,
        "createdAt": doc.get("createdAt"),
        "updatedAt": doc.get("updatedAt"),
    }
    return Document(page_content=page_text, metadata=metadata)
# -----------------------------
# 4. Fetch & preprocess all docs
# -----------------------------
# NOTE(review): this materializes the entire collection in memory at once —
# fine for small datasets; consider batched cursors if the collection grows.
raw_docs = list(collection.find({}))
documents = [preprocess_document(doc) for doc in raw_docs]
print(f"Fetched {len(documents)} documents from MongoDB")
# -----------------------------
# 5. Chunk documents
# -----------------------------
# 512-character chunks with 50-character overlap; each property summary is
# short, so most documents will stay a single chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
docs = text_splitter.split_documents(documents)
print(f"After chunking → {len(docs)} chunks")
# -----------------------------
# 6. Generate embeddings
# -----------------------------
# Downloads the model on first run; embeds every chunk and builds the index.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)
# -----------------------------
# 7. Save FAISS index & metadata separately
# -----------------------------
# Save FAISS vectorstore (index + metadata) into the folder
vectorstore.save_local(VECTORSTORE_DIR)
print(f"✅ FAISS vectors and metadata saved in {VECTORSTORE_DIR}")
|