# PROPERT_AI / src / ingest.py
# Subhakanta
# Deploy chatbot FastAPI app
# 8630e6c
import os
import pymongo
import pickle
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from datetime import datetime
# -----------------------------
# 1. Load environment variables
# -----------------------------
# Pull settings from a .env file (when one is found) into the environment.
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")  # MongoDB connection string
DB_NAME = os.getenv("DB_NAME", "company_chatbot")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "processed_data")
# Output folder for the FAISS index. The default is a relative path, so it is
# resolved against the current working directory rather than this file's
# location; set VECTORSTORE_DIR in the environment to point somewhere else.
VECTORSTORE_DIR = os.getenv("VECTORSTORE_DIR", "../vectorstore")
os.makedirs(VECTORSTORE_DIR, exist_ok=True)
if MONGO_URI is None:
    # MongoClient(None) does not raise: it quietly targets its default host,
    # so make the missing setting visible instead of failing much later.
    print("WARNING: MONGO_URI is not set; MongoClient will use its default host (localhost:27017)")
# -----------------------------
# 2. Connect to MongoDB
# -----------------------------
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]
# -----------------------------
# 3. Preprocessing helpers
# -----------------------------
def clean_string(val):
    """Return ``val`` as text without quote characters or outer whitespace.

    ``None`` maps to the empty string. Any other value is converted with
    ``str()``, every single and double quote is removed, and the result is
    stripped of leading/trailing whitespace.
    """
    # Mapping a code point to None tells str.translate() to delete it.
    drop_quotes = {ord('"'): None, ord("'"): None}
    return "" if val is None else str(val).translate(drop_quotes).strip()
def clean_numeric(val):
    """Convert ``val`` to ``float``, or return ``None`` if it is not numeric.

    Args:
        val: Any value to be passed to ``float()`` (number, numeric string, ...).

    Returns:
        The converted float, or ``None`` when ``val`` is ``None``, has an
        unsupported type, is non-numeric text, or is too large for a float.
    """
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no value". A bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return None
def preprocess_document(doc):
    """Convert a MongoDB property record into a LangChain ``Document``.

    Text fields are normalised with ``clean_string`` and numeric fields with
    ``clean_numeric``; absent fields therefore become ``""`` or ``None``.

    Args:
        doc: Mapping for one record; it is only read through ``doc.get``.

    Returns:
        Document: ``page_content`` is a single-line "Label: value" summary
        used for embedding, and ``metadata`` holds the structured fields.
    """
    project_name = clean_string(doc.get("projectName"))
    project_type = clean_string(doc.get("projectType"))
    project_category = clean_string(doc.get("projectCategory"))
    slug = clean_string(doc.get("slug"))
    status = clean_string(doc.get("status"))
    # "type" wins when it is truthy; otherwise fall back to "customBHK".
    bhk = clean_string(doc.get("type") or doc.get("customBHK"))
    price = clean_numeric(doc.get("price"))
    carpet_area = clean_numeric(doc.get("carpetArea"))
    bathrooms = clean_numeric(doc.get("bathrooms"))
    balcony = clean_numeric(doc.get("balcony"))
    furnished = clean_string(doc.get("furnishedType"))
    lift = doc.get("lift", False)
    possession_date = clean_string(doc.get("possessionDate"))
    amenities = clean_string(doc.get("aboutProperty"))
    address = clean_string(doc.get("Address info"))

    # City and locality come only from the slug: the second-to-last and
    # third-to-last "-"-separated segments respectively.
    # NOTE(review): assumes slugs end in "...-<locality>-<city>-<suffix>" — confirm.
    parts = slug.split("-") if slug else []
    locality = parts[-3].capitalize() if len(parts) >= 3 else ""
    city = parts[-2].capitalize() if len(parts) >= 2 else ""

    # Text that gets embedded. Missing numeric values appear as "None".
    content = f"""
    Project Name: {project_name}
    Type: {bhk}
    Status: {status}
    Price: {price}
    Carpet Area: {carpet_area}
    Bathrooms: {bathrooms}
    Balcony: {balcony}
    Furnishing: {furnished}
    Lift: {lift}
    Location: {locality}, {city}
    Address: {address}
    Amenities: {amenities}
    """

    # Structured metadata
    metadata = {
        "id": str(doc.get("_id")),
        "slug": slug,
        "projectName": project_name,
        "projectType": project_type,
        "projectCategory": project_category,
        "status": status,
        "BHK": bhk,
        "price": price,
        # Compare against None rather than truthiness so a price of 0 yields
        # 0.0 instead of being reported as missing. Divisor is one crore.
        "price_in_cr": round(price / 10_000_000, 2) if price is not None else None,
        "carpetArea": carpet_area,
        "bathrooms": bathrooms,
        "balcony": balcony,
        "furnishedType": furnished,
        "lift": lift,
        "possessionDate": possession_date,
        "city": city,
        "locality": locality,
        "address": address,
        "amenities": amenities,
        "createdAt": doc.get("createdAt"),
        "updatedAt": doc.get("updatedAt"),
    }

    # Collapse every run of whitespace (including the template's newlines and
    # indentation) into single spaces.
    return Document(page_content=" ".join(content.split()), metadata=metadata)
# -----------------------------
# 4. Fetch & preprocess all docs
# -----------------------------
raw_docs = list(collection.find({}))
documents = [preprocess_document(doc) for doc in raw_docs]
print(f"Fetched {len(documents)} documents from MongoDB")
if not documents:
    # An index cannot be built from zero documents. Stop here with a clear
    # message, before the embedding model is loaded and before anything on
    # disk is overwritten.
    raise SystemExit(
        f"No documents found in {DB_NAME}.{COLLECTION_NAME}; vectorstore was not rebuilt"
    )
# -----------------------------
# 5. Chunk documents
# -----------------------------
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
docs = text_splitter.split_documents(documents)
print(f"After chunking → {len(docs)} chunks")
# -----------------------------
# 6. Generate embeddings
# -----------------------------
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)
# -----------------------------
# 7. Save FAISS index & metadata
# -----------------------------
# A single save_local call writes the vectorstore (index + metadata) into the folder
vectorstore.save_local(VECTORSTORE_DIR)
print(f"✅ FAISS vectors and metadata saved in {VECTORSTORE_DIR}")