# chatbot.py
"""Grounded property-search chatbot.

Pipeline: parse a free-text query into structured filters, run a FAISS
similarity search, filter the hits deterministically on document metadata,
then ask a Groq-hosted LLM to summarise ONLY the surviving records as a JSON
object with a ``summary`` string and a list of ``cards``.
"""
import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple

from dotenv import load_dotenv
from langchain.schema import Document, HumanMessage
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq  # groq LLM wrapper
from langchain_huggingface import HuggingFaceEmbeddings

load_dotenv()

# -------- Safe absolute path ----------
# The project root is the parent of the directory that contains this file.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # project root
VECTORSTORE_DIR = os.getenv("VECTORSTORE_DIR", os.path.join(PROJECT_ROOT, "vectorstore"))
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Load embeddings & vectorstore
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_index_path = os.path.join(VECTORSTORE_DIR, "index.faiss")
if not os.path.exists(faiss_index_path):
    raise FileNotFoundError(f"FAISS index not found at {faiss_index_path}")
# SECURITY NOTE(review): allow_dangerous_deserialization=True lets FAISS.load_local
# unpickle the stored docstore. Only point VECTORSTORE_DIR at an index this
# project built itself; never at a file obtained from an untrusted source.
db = FAISS.load_local(VECTORSTORE_DIR, embeddings, allow_dangerous_deserialization=True)

# Instantiate Groq LLM
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model="llama-3.1-8b-instant",
)


# ---------------------------
# 1) Query parsing helpers
# ---------------------------

# "under <currency marker> <number> <optional unit>". The unit must end on a
# word boundary so that "under 2 km" or "under 50 luxury ..." is not read as
# 2 thousand / 50 lakh.
_BUDGET_RE = re.compile(
    r"under\s*[₹rs\.]*\s*([0-9]+(?:\.[0-9]+)?)"
    r"(?:\s*(crores?|cr|lakhs?|lacs?|l|k)\b)?"
)
_RAW_RUPEES_RE = re.compile(r"([0-9]{6,})")
_UNIT_MULTIPLIERS = {
    "cr": 1e7,
    "crore": 1e7,
    "crores": 1e7,
    "l": 1e5,
    "lakh": 1e5,
    "lakhs": 1e5,
    "lac": 1e5,
    "lacs": 1e5,
    "k": 1e3,
}

_BHK_RE = re.compile(r"(\d+)\s*-?\s*bhk")

# Alias -> canonical city name. Insertion order is the match priority.
# Extend this mapping as you need.
_CITY_ALIASES = {
    "pune": "Pune",
    "mumbai": "Mumbai",
    "delhi": "Delhi",
    "bangalore": "Bangalore",
    "bengaluru": "Bangalore",
    "bangaluru": "Bangalore",  # misspelling kept for backward compatibility
    "chennai": "Chennai",
    "hyderabad": "Hyderabad",
    "kolkata": "Kolkata",
}
_CITY_PATTERNS = [
    (re.compile(rf"\b{re.escape(alias)}\b"), canonical)
    for alias, canonical in _CITY_ALIASES.items()
]

_READY_RE = re.compile(r"\bready\b")
_UNDER_CONSTRUCTION_RE = re.compile(r"\bunder[\s-]*construction\b|\buc\b")

_LOCALITY_RE = re.compile(r"\b(?:in|near|at)\s+([a-zA-Z0-9\- ]{3,30})")
# Words that start a different filter clause; the locality phrase ends there.
_LOCALITY_STOP_RE = re.compile(r"\b(?:under|below|within|with|for|ready|near|at|in)\b")


def parse_budget(text: str) -> Optional[float]:
    """Parse an upper budget limit from free text.

    Recognises phrases such as:
      - "under ₹1.2 Cr" -> 12000000.0
      - "under 1.2cr", "under 50 lakh", "under 12000000"
    Falls back to the first bare run of six or more digits anywhere in the
    text. Commas are ignored ("1,20,00,000" -> 12000000).

    Returns the amount in rupees, or None when no budget is found.
    """
    if not text:
        return None
    s = text.replace(",", "").lower()

    m = _BUDGET_RE.search(s)
    if m:
        num = float(m.group(1))
        # No unit means the number is already in rupees.
        return num * _UNIT_MULTIPLIERS.get(m.group(2) or "", 1.0)

    # alternative: find direct rupee integers like 12000000
    m2 = _RAW_RUPEES_RE.search(s)
    if m2:
        return float(m2.group(1))
    return None


def parse_bhk(text: str) -> Optional[str]:
    """Return a label like '2BHK' or '3BHK' if the text mentions one, else None."""
    if not text:
        return None
    m = _BHK_RE.search(text.lower())
    if m:
        return f"{m.group(1)}BHK"
    return None


def parse_city(text: str) -> Optional[str]:
    """Very simple city detection: look for known city names as whole words.

    Returns the canonical capitalised name (all Bangalore spellings map to
    "Bangalore"), or None when no known city is mentioned. When several
    cities appear, the one listed first in ``_CITY_ALIASES`` wins.
    """
    if not text:
        return None
    s = text.lower()
    for pattern, canonical in _CITY_PATTERNS:
        if pattern.search(s):
            return canonical
    return None


def parse_status(text: str) -> Optional[str]:
    """Detect readiness intent.

    Returns "READY_TO_MOVE", "UNDER_CONSTRUCTION", or None. Keywords are
    matched as whole words so that "already" or "much" do not trigger a
    status. "ready" takes precedence when both appear.
    """
    if not text:
        return None
    s = text.lower()
    if _READY_RE.search(s):
        return "READY_TO_MOVE"
    if _UNDER_CONSTRUCTION_RE.search(s):
        return "UNDER_CONSTRUCTION"
    return None


def parse_locality_or_project(text: str) -> Optional[str]:
    """Pick up a locality / project phrase (heuristic).

    Takes the words that follow a standalone 'in', 'near' or 'at' and cuts
    them at the next filter keyword, so "flat in pune under 1 cr" yields
    "Pune" rather than "Pune Under 1 Cr". Returns the phrase title-cased,
    or None when nothing usable is found.
    """
    if not text:
        return None
    for m in _LOCALITY_RE.finditer(text.lower()):
        phrase = _LOCALITY_STOP_RE.split(m.group(1), maxsplit=1)[0].strip()
        if phrase:
            return phrase.title()
    return None


def parse_query(query: str) -> Dict[str, Any]:
    """Aggregate all parsed filters into one dict (the raw query is under "raw")."""
    return {
        "raw": query,
        "budget_rupees": parse_budget(query),
        "bhk": parse_bhk(query),
        "city": parse_city(query),
        "status": parse_status(query),
        "locality_or_project": parse_locality_or_project(query),
    }


# ---------------------------
# 2) Search + deterministic filter
# ---------------------------
def semantic_search(query: str, k: int = 10) -> List[Document]:
    """Run similarity search over FAISS and return the top-k Document objects."""
    return db.similarity_search(query, k=k)


def _to_number(raw: Any) -> Optional[float]:
    """Return ``raw`` as a number, or None when it cannot be interpreted as one."""
    if isinstance(raw, (int, float)):
        return raw
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None


def _price_in_rupees(md: Dict[str, Any]) -> Optional[float]:
    """Return the price in rupees from ``price`` or ``price_in_cr``.

    ``price`` is preferred; a missing, zero or unparseable value falls
    through to ``price_in_cr`` (converted at 1 Cr = 1e7 rupees).
    """
    for key, factor in (("price", 1.0), ("price_in_cr", 1e7)):
        value = _to_number(md.get(key))
        if value:
            return float(value) * factor
    return None


def _compact(value: Any) -> str:
    """Lower-case ``value`` and drop whitespace/hyphens ("2 BHK" -> "2bhk")."""
    return re.sub(r"[\s\-]+", "", str(value or "").lower())


def _normalise_status(value: Any) -> str:
    """Lower-case ``value`` and collapse spaces/hyphens/underscores to "_"."""
    return re.sub(r"[\s_\-]+", "_", str(value or "").strip().lower())


def apply_filters(docs: List[Document], filters: Dict[str, Any]) -> List[Document]:
    """Filter retrieved docs using structured metadata.

    Args:
        docs: Documents returned by the semantic search.
        filters: Dict as produced by ``parse_query``. The keys read are
            "budget_rupees", "bhk", "city", "status" and
            "locality_or_project"; a missing or falsy value disables that
            filter.

    Returns:
        The documents (in their original order) that satisfy every active
        filter. A document with no usable price never passes a budget filter.
    """
    budget_raw = filters.get("budget_rupees")
    budget = _to_number(budget_raw)
    bhk = _compact(filters.get("bhk"))
    city = str(filters.get("city") or "").lower()
    status = _normalise_status(filters.get("status"))
    locality = str(filters.get("locality_or_project") or "").lower()

    def keep(doc: Document) -> bool:
        md = doc.metadata or {}

        # city filter
        if city and city not in str(md.get("city") or "").lower():
            return False

        # bhk filter (metadata key may be "BHK" or "bhk")
        if bhk and bhk not in _compact(md.get("BHK") or md.get("bhk")):
            return False

        # price filter: a budget that was supplied but is not numeric rejects
        # everything, as does a document without a usable price.
        if budget_raw is not None:
            price = _price_in_rupees(md)
            if budget is None or price is None or price > budget:
                return False

        # status filter
        if status and status not in _normalise_status(md.get("status")):
            return False

        # locality filter: any of these metadata fields may carry the name
        if locality:
            haystacks = (
                str(md.get(key)).lower()
                for key in ("locality", "address", "slug", "projectName")
                if md.get(key)
            )
            if not any(locality in h for h in haystacks):
                return False

        return True

    return [d for d in docs if keep(d)]


# ---------------------------
# 3) Create summary + cards input (no hallucination)
# ---------------------------
def _format_price(md: Dict[str, Any]) -> str:
    """Render the price as "₹<x> Cr", "₹<rupees>" or "N/A"."""
    price_cr = _to_number(md.get("price_in_cr"))
    if price_cr:
        return f"₹{round(price_cr, 2)} Cr"
    price_rupee = _to_number(md.get("price"))
    if price_rupee:
        return f"₹{int(price_rupee)}"
    return "N/A"


def build_context_for_llm(docs: List[Document]) -> str:
    """Build a compact, plain-text context from the retrieved docs.

    Each document becomes one ``ITEM_<n> || key: value || ...`` line built
    from its metadata. The result is passed to the LLM, which is instructed
    to use only this data.
    """
    lines = []
    for i, d in enumerate(docs, 1):
        md = d.metadata or {}
        title = md.get("projectName") or md.get("slug") or "Unknown"
        locality = md.get("locality") or ""
        city = md.get("city") or ""
        bhk = md.get("BHK") or md.get("bhk") or ""
        price_str = _format_price(md)
        status = md.get("status") or ""
        amenities = md.get("amenities") or ""
        possession = md.get("possessionDate") or ""
        slug = md.get("slug") or ""
        lines.append(
            f"ITEM_{i} || title: {title} || city: {city} || locality: {locality} || bhk: {bhk} || price: {price_str} || status: {status} || possession: {possession} || amenities: {amenities} || slug: {slug}"
        )
    return "\n".join(lines)


# ---------------------------
# 4) Prompt to Groq (strict, grounded)
# ---------------------------
_PARSE_ERROR_SUMMARY = "Error: Could not parse LLM output as JSON."


def _parse_llm_json(text: str) -> Optional[Dict[str, Any]]:
    """Return the JSON object contained in ``text``, or None.

    Tries the whole text first, then the span from the first "{" to the
    last "}" (models often wrap JSON in prose or code fences). Only a JSON
    object (dict) is accepted.
    """
    candidates = [text]
    blob = re.search(r"(\{.*\})", text, re.S)
    if blob:
        candidates.append(blob.group(1))
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (json.JSONDecodeError, TypeError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def generate_summary_and_cards(user_query: str, records_text: str) -> dict:
    """Ask the LLM for a grounded summary and cards for the given records.

    Args:
        user_query: The user's original query text.
        records_text: Records as produced by ``build_context_for_llm``.

    Returns:
        A dict that always has a non-empty "summary" string and a "cards"
        list. When the model output cannot be parsed as a JSON object, the
        summary is an error message and "cards" is empty.
    """
    prompt = f"""
You are an assistant for NoBrokerage.com. You will be given property records.

**INSTRUCTIONS:**
- Use ONLY the information in the provided records (do not hallucinate).
- Produce a JSON object with two keys: "summary" and "cards".
- "summary": 2-4 sentences summarizing matching properties, including price, BHK, readiness, localities, counts.
- "cards": list of at most 6 objects with keys: title, city_locality, bhk, price, project_name, possession_status, top_amenities (list of 1-3 strings), cta_url.
- If no records match, return: {{"summary":"No matching properties found. I expanded the search and found X alternatives.","cards":[]}}

Records:
{records_text}

User query:
{user_query}
"""

    # Call Groq LLM
    resp = llm.generate([[HumanMessage(content=prompt)]])

    # Extract text; fall back to the raw repr if the result shape is unexpected
    try:
        text = resp.generations[0][0].text
    except (AttributeError, IndexError, TypeError):
        text = str(resp)

    result_json = _parse_llm_json(text)
    if result_json is None:
        result_json = {"summary": _PARSE_ERROR_SUMMARY, "cards": []}

    # Callers iterate over "cards", so guarantee it is a list.
    if not isinstance(result_json.get("cards"), list):
        result_json["cards"] = []

    # Ensure summary fallback is strictly formatted
    if not result_json.get("summary"):
        result_json["summary"] = f"No matching properties found for '{user_query}'."

    return result_json


# ---------------------------
# 5) Main handler
# ---------------------------
def handle_query(query: str, k: int = 12) -> Dict[str, Any]:
    """Run the full pipeline for one user query.

    Steps:
      - parse the query into filters
      - semantic search (top ``k``)
      - deterministic metadata filter
      - pass the surviving records to the LLM for summary + cards
        (the LLM is instructed to use only these records)

    When the filter removes every hit, the first six unfiltered hits are
    used as alternatives. Cards that come back without a ``cta_url`` get
    ``/project/<slug>`` from the matching document.

    Returns:
        Dict with "summary" (str) and "cards" (list).
    """
    parsed = parse_query(query)
    sem_docs = semantic_search(query, k=k)

    # apply deterministic metadata filter
    filtered = apply_filters(sem_docs, parsed)

    # If none after filtering, fall back to the original hits (up to 6)
    to_use = filtered if filtered else sem_docs[:6]

    # If absolutely no documents at all, skip the LLM call entirely
    if not to_use:
        return {"summary": "No matching properties found and no alternatives available.", "cards": []}

    records_text = build_context_for_llm(to_use)
    llm_result = generate_summary_and_cards(query, records_text)

    # Map project name -> slug so a card can be matched to its document even
    # if the model reordered or dropped records.
    slug_by_name: Dict[str, str] = {}
    for doc in to_use:
        md = doc.metadata or {}
        name = str(md.get("projectName") or "").strip().lower()
        if name:
            slug_by_name.setdefault(name, md.get("slug") or "")

    cards = llm_result.get("cards") or []
    for idx, card in enumerate(cards):
        if not isinstance(card, dict) or card.get("cta_url"):
            continue
        slug = None
        for field in ("project_name", "title"):
            key = str(card.get(field) or "").strip().lower()
            if key in slug_by_name:
                slug = slug_by_name[key]
                break
        if slug is None:
            # Fall back to positional pairing; leave the card untouched when
            # there is no document at this position.
            if idx >= len(to_use):
                continue
            slug = (to_use[idx].metadata or {}).get("slug") or ""
        card["cta_url"] = f"/project/{slug}"

    return llm_result


# ---------------------------
# CLI interactive usage
# ---------------------------
if __name__ == "__main__":
    print("NoBrokerage Chatbot (Groq) — demo (grounded summary + cards).")
    print("Type 'exit' to quit.")
    while True:
        try:
            q = input("\nEnter user query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly instead of a traceback.
            print()
            break
        if q.lower() in ("exit", "quit"):
            break
        if not q:
            continue
        out = handle_query(q)
        print("\n=== Summary ===")
        print(out.get("summary"))
        # print("\n=== Cards ===")
        # print(json.dumps(out.get("cards", []), indent=2))