# Source: Hugging Face Spaces - Subhakanta156/PROPERT_AI (Space status: Sleeping)
# File size: 4,701 Bytes
# Revision: 8630e6c

import os
import pymongo
import pickle
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from datetime import datetime

# -----------------------------
# 1. Load environment variables
# -----------------------------
load_dotenv()

# Connection settings are read from the environment. MONGO_URI has no
# fallback, so it is None when the variable is not set.
MONGO_URI = os.environ.get("MONGO_URI")
DB_NAME = os.environ.get("DB_NAME", "company_chatbot")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "processed_data")

# Output folder for the vector store. The path is relative, so it is resolved
# against the current working directory; it is created up front if absent.
VECTORSTORE_DIR = "../vectorstore"
os.makedirs(VECTORSTORE_DIR, exist_ok=True)

# -----------------------------
# 2. Connect to MongoDB
# -----------------------------
# Module-level handles used by the fetch step further down.
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

# -----------------------------
# 3. Preprocessing helpers
# -----------------------------
def clean_string(val):
    """Return ``val`` as text with quote characters removed and ends trimmed.

    ``None`` becomes an empty string. Any other value is converted with
    ``str()``, every double and single quote is dropped, and leading and
    trailing whitespace is stripped.
    """
    if val is None:
        return ""
    # Delete both quote characters in one pass. Stripping happens afterwards,
    # so whitespace exposed by a removed quote is trimmed as well.
    drop_quotes = str.maketrans("", "", "\"'")
    return str(val).translate(drop_quotes).strip()

def clean_numeric(val):
    """Convert ``val`` to ``float``, or return ``None`` when it cannot be.

    Args:
        val: Any value; typically a number, a numeric string, or ``None``.

    Returns:
        ``float(val)`` on success, otherwise ``None``.
    """
    try:
        return float(val)
    # Only conversion failures mean "no usable number": TypeError for
    # unsupported types such as None, ValueError for non-numeric text, and
    # OverflowError for integers too large for a float. Anything else
    # (e.g. KeyboardInterrupt) is allowed to propagate instead of being hidden.
    except (TypeError, ValueError, OverflowError):
        return None

def preprocess_document(doc):
    """Turn one MongoDB record into a LangChain ``Document``.

    Text fields are normalised with ``clean_string`` and numeric fields with
    ``clean_numeric``. The page content is a single whitespace-collapsed line
    of ``Label: value`` pairs; the metadata dict carries the cleaned fields
    plus the locality/city derived from the slug and the price in crore.

    Args:
        doc: Mapping for one record; every field is read with ``doc.get``.

    Returns:
        A ``Document`` whose ``page_content`` is the text to embed and whose
        ``metadata`` holds the structured fields.
    """
    def text_field(key):
        # Cleaned string for ``key``; "" when the field is absent.
        return clean_string(doc.get(key))

    def number_field(key):
        # Float for ``key``; None when absent or not convertible.
        return clean_numeric(doc.get(key))

    name = text_field("projectName")
    kind = text_field("projectType")
    category = text_field("projectCategory")
    slug = text_field("slug")
    status = text_field("status")
    # "type" wins when it is truthy; otherwise fall back to "customBHK".
    bhk = clean_string(doc.get("type") or doc.get("customBHK"))
    price = number_field("price")
    carpet_area = number_field("carpetArea")
    bathrooms = number_field("bathrooms")
    balcony = number_field("balcony")
    furnishing = text_field("furnishedType")
    lift = doc.get("lift", False)  # stored as-is, not cleaned
    possession = text_field("possessionDate")
    amenities = text_field("aboutProperty")
    address = text_field("Address info")

    # Locality and city are the third-last and second-last dash-separated
    # slug tokens; each stays "" when the slug has too few tokens.
    locality = ""
    city = ""
    slug_tokens = slug.split("-") if slug else []
    if len(slug_tokens) >= 2:
        city = slug_tokens[-2].capitalize()
    if len(slug_tokens) >= 3:
        locality = slug_tokens[-3].capitalize()

    # Text that gets embedded. Runs of whitespace (including any inside the
    # values) are collapsed to single spaces.
    labelled = (
        ("Project Name", name),
        ("Type", bhk),
        ("Status", status),
        ("Price", price),
        ("Carpet Area", carpet_area),
        ("Bathrooms", bathrooms),
        ("Balcony", balcony),
        ("Furnishing", furnishing),
        ("Lift", lift),
        ("Location", f"{locality}, {city}"),
        ("Address", address),
        ("Amenities", amenities),
    )
    joined = " ".join(f"{label}: {value}" for label, value in labelled)
    page_content = " ".join(joined.split())

    # Price in crore (1 crore = 10,000,000); None when the price is missing
    # or zero.
    price_in_cr = None
    if price:
        price_in_cr = round(price / 10000000, 2)

    # Structured metadata stored alongside the embedded text.
    metadata = {
        "id": str(doc.get("_id")),
        "slug": slug,
        "projectName": name,
        "projectType": kind,
        "projectCategory": category,
        "status": status,
        "BHK": bhk,
        "price": price,
        "price_in_cr": price_in_cr,
        "carpetArea": carpet_area,
        "bathrooms": bathrooms,
        "balcony": balcony,
        "furnishedType": furnishing,
        "lift": lift,
        "possessionDate": possession,
        "city": city,
        "locality": locality,
        "address": address,
        "amenities": amenities,
        "createdAt": doc.get("createdAt"),
        "updatedAt": doc.get("updatedAt"),
    }

    return Document(page_content=page_content, metadata=metadata)


# -----------------------------
# 4. Fetch & preprocess all docs
# -----------------------------
# The whole collection is loaded into memory before conversion.
raw_docs = list(collection.find({}))
documents = [preprocess_document(doc) for doc in raw_docs]
print(f"Fetched {len(documents)} documents from MongoDB")

# Stop with a clear message when there is nothing to index. Otherwise the
# empty list reaches FAISS.from_documents, which cannot build an index from
# zero vectors and fails with an error unrelated to the real cause.
if not documents:
    raise SystemExit(
        f"No documents found in {DB_NAME}.{COLLECTION_NAME}; nothing to index."
    )

# -----------------------------
# 5. Chunk documents
# -----------------------------
# Split into chunks of at most 512 characters, with 50 characters of overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
docs = text_splitter.split_documents(documents)
print(f"After chunking → {len(docs)} chunks")

# -----------------------------
# 6. Generate embeddings
# -----------------------------
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)

# -----------------------------
# 7. Save FAISS index & metadata
# -----------------------------
# save_local writes the vector store (index + metadata) into VECTORSTORE_DIR.
vectorstore.save_local(VECTORSTORE_DIR)
print(f"✅ FAISS vectors and metadata saved in {VECTORSTORE_DIR}")