File size: 4,701 Bytes
8630e6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import pymongo
import pickle
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from datetime import datetime

# -----------------------------
# 1. Load environment variables
# -----------------------------
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")  # Mongo connection
DB_NAME = os.getenv("DB_NAME", "company_chatbot")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "processed_data")
VECTORSTORE_DIR = "../vectorstore"
os.makedirs(VECTORSTORE_DIR, exist_ok=True)

# -----------------------------
# 2. Connect to MongoDB
# -----------------------------
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

# -----------------------------
# 3. Preprocessing helpers
# -----------------------------
def clean_string(val):
    if val is None:
        return ""
    return str(val).replace('"', '').replace("'", '').strip()

def clean_numeric(val):
    try:
        return float(val)
    except:
        return None

def preprocess_document(doc):
    """Convert MongoDB doc to LangChain Document with full structured metadata"""
    project_name = clean_string(doc.get("projectName"))
    project_type = clean_string(doc.get("projectType"))
    project_category = clean_string(doc.get("projectCategory"))
    slug = clean_string(doc.get("slug"))
    status = clean_string(doc.get("status"))
    bhk = clean_string(doc.get("type") or doc.get("customBHK"))
    price = clean_numeric(doc.get("price"))
    carpet_area = clean_numeric(doc.get("carpetArea"))
    bathrooms = clean_numeric(doc.get("bathrooms"))
    balcony = clean_numeric(doc.get("balcony"))
    furnished = clean_string(doc.get("furnishedType"))
    lift = doc.get("lift", False)
    possession_date = clean_string(doc.get("possessionDate"))
    amenities = clean_string(doc.get("aboutProperty"))
    address = clean_string(doc.get("Address info"))

    # Extract city/locality from slug (fallback if missing)
    parts = slug.split("-") if slug else []
    locality = parts[-3].capitalize() if len(parts) >= 3 else ""
    city = parts[-2].capitalize() if len(parts) >= 2 else ""

    # Page content for embeddings (can include any text you want LLM to use)
    content = f"""

    Project Name: {project_name}

    Type: {bhk}

    Status: {status}

    Price: {price}

    Carpet Area: {carpet_area}

    Bathrooms: {bathrooms}

    Balcony: {balcony}

    Furnishing: {furnished}

    Lift: {lift}

    Location: {locality}, {city}

    Address: {address}

    Amenities: {amenities}

    """

    # Structured metadata
    metadata = {
        "id": str(doc.get("_id")),
        "slug": slug,
        "projectName": project_name,
        "projectType": project_type,
        "projectCategory": project_category,
        "status": status,
        "BHK": bhk,
        "price": price,
        "price_in_cr": round(price / 10000000, 2) if price else None,
        "carpetArea": carpet_area,
        "bathrooms": bathrooms,
        "balcony": balcony,
        "furnishedType": furnished,
        "lift": lift,
        "possessionDate": possession_date,
        "city": city,
        "locality": locality,
        "address": address,
        "amenities": amenities,
        "createdAt": doc.get("createdAt"),
        "updatedAt": doc.get("updatedAt")
    }

    return Document(page_content=" ".join(content.split()), metadata=metadata)


# -----------------------------
# 4. Fetch & preprocess all docs
# -----------------------------
raw_docs = list(collection.find({}))
documents = [preprocess_document(doc) for doc in raw_docs]
print(f"Fetched {len(documents)} documents from MongoDB")

# -----------------------------
# 5. Chunk documents
# -----------------------------
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
docs = text_splitter.split_documents(documents)
print(f"After chunking → {len(docs)} chunks")

# -----------------------------
# 6. Generate embeddings
# -----------------------------
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)

# -----------------------------
# 7. Save FAISS index & metadata separately
# -----------------------------
# Save FAISS vectorstore (index + metadata) into the folder
vectorstore.save_local(VECTORSTORE_DIR)
print(f"✅ FAISS vectors and metadata saved in {VECTORSTORE_DIR}")