Spaces:
Sleeping
Sleeping
| import os | |
| import pymongo | |
| import pickle | |
| from dotenv import load_dotenv | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.schema import Document | |
| from datetime import datetime | |
# -----------------------------
# 1. Load environment variables
# -----------------------------
load_dotenv()

# Fail fast when the connection string is absent: pymongo.MongoClient(None)
# would otherwise silently fall back to localhost and hide misconfiguration.
MONGO_URI = os.getenv("MONGO_URI")  # Mongo connection
if not MONGO_URI:
    raise RuntimeError("MONGO_URI environment variable is not set")

DB_NAME = os.getenv("DB_NAME", "company_chatbot")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "processed_data")

# Output folder for the FAISS index, resolved relative to the working dir.
VECTORSTORE_DIR = "../vectorstore"
os.makedirs(VECTORSTORE_DIR, exist_ok=True)
# -----------------------------
# 2. Connect to MongoDB
# -----------------------------
# NOTE: MongoClient is lazy — no network round-trip happens here; a bad URI
# only surfaces at the first real operation (the collection.find() below).
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]
# -----------------------------
# 3. Preprocessing helpers
# -----------------------------
def clean_string(val):
    """Coerce *val* to text with all quote characters removed.

    None maps to the empty string; anything else is str()-ified, has every
    single and double quote deleted, and is stripped of surrounding
    whitespace.
    """
    if val is None:
        return ""
    # One C-level pass deletes both quote characters (3rd maketrans arg).
    return str(val).translate(str.maketrans("", "", "\"'")).strip()
def clean_numeric(val):
    """Coerce *val* to float, returning None when conversion is impossible.

    Catches only TypeError (None, lists, dicts, ...) and ValueError
    (non-numeric strings). The original bare ``except:`` also swallowed
    SystemExit/KeyboardInterrupt and could mask genuine bugs.
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        return None
def preprocess_document(doc):
    """Convert a raw MongoDB document into a LangChain Document.

    page_content is a whitespace-collapsed textual summary fed to the
    embedding model; every structured field is also mirrored into metadata
    so retrieved chunks can be filtered after search.
    """
    # --- normalise scalar fields via the shared helpers ---------------
    name = clean_string(doc.get("projectName"))
    ptype = clean_string(doc.get("projectType"))
    category = clean_string(doc.get("projectCategory"))
    slug = clean_string(doc.get("slug"))
    status = clean_string(doc.get("status"))
    bhk = clean_string(doc.get("type") or doc.get("customBHK"))
    price = clean_numeric(doc.get("price"))
    area = clean_numeric(doc.get("carpetArea"))
    baths = clean_numeric(doc.get("bathrooms"))
    balconies = clean_numeric(doc.get("balcony"))
    furnishing = clean_string(doc.get("furnishedType"))
    has_lift = doc.get("lift", False)
    possession = clean_string(doc.get("possessionDate"))
    about = clean_string(doc.get("aboutProperty"))
    address = clean_string(doc.get("Address info"))

    # --- derive locality/city from the trailing slug segments ---------
    # assumes slugs end "...-<locality>-<city>-<something>" — TODO confirm
    segments = slug.split("-") if slug else []
    locality = segments[-3].capitalize() if len(segments) >= 3 else ""
    city = segments[-2].capitalize() if len(segments) >= 2 else ""

    # Free-text body for embeddings (any text the LLM should see).
    raw_text = f"""
Project Name: {name}
Type: {bhk}
Status: {status}
Price: {price}
Carpet Area: {area}
Bathrooms: {baths}
Balcony: {balconies}
Furnishing: {furnishing}
Lift: {has_lift}
Location: {locality}, {city}
Address: {address}
Amenities: {about}
"""

    # Structured metadata carried alongside each chunk.
    meta = {
        "id": str(doc.get("_id")),
        "slug": slug,
        "projectName": name,
        "projectType": ptype,
        "projectCategory": category,
        "status": status,
        "BHK": bhk,
        "price": price,
        # Rupees → crores (1 cr = 1e7); falsy price (None or 0) stays None.
        "price_in_cr": round(price / 10000000, 2) if price else None,
        "carpetArea": area,
        "bathrooms": baths,
        "balcony": balconies,
        "furnishedType": furnishing,
        "lift": has_lift,
        "possessionDate": possession,
        "city": city,
        "locality": locality,
        "address": address,
        "amenities": about,
        "createdAt": doc.get("createdAt"),
        "updatedAt": doc.get("updatedAt"),
    }

    # Collapse every whitespace run to a single space before embedding.
    return Document(page_content=" ".join(raw_text.split()), metadata=meta)
# -----------------------------
# 4. Fetch & preprocess all docs
# -----------------------------
# Materialises the entire collection in memory; fine for modest corpora,
# but consider batching if the collection grows large.
raw_docs = list(collection.find({}))
documents = [preprocess_document(doc) for doc in raw_docs]
print(f"Fetched {len(documents)} documents from MongoDB")
# -----------------------------
# 5. Chunk documents
# -----------------------------
# 512-char chunks with 50-char overlap; each chunk inherits its parent
# Document's metadata, so structured filtering still works post-split.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
docs = text_splitter.split_documents(documents)
print(f"After chunking → {len(docs)} chunks")
# -----------------------------
# 6. Generate embeddings
# -----------------------------
# all-MiniLM-L6-v2: compact 384-dim sentence-transformer; the model weights
# are downloaded from the Hugging Face Hub on first run.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)
# -----------------------------
# 7. Save FAISS index & metadata
# -----------------------------
# save_local writes both the FAISS index and the docstore/metadata pickle
# into VECTORSTORE_DIR as one unit (load back with FAISS.load_local).
vectorstore.save_local(VECTORSTORE_DIR)
print(f"✅ FAISS vectors and metadata saved in {VECTORSTORE_DIR}")