Spaces:

csong03
/

14c_chatbot

Runtime error

csong03

Initial Space upload with LFS-tracked binaries

9e118e4 3 months ago

16.9 kB

	"""
	BPS School Finder - Database Query Interface
	==============================================
	Runtime query class for the chatbot. Import this module:

	from database import BPSDatabase
	db = BPSDatabase()
	"""

	import sqlite3
	import json
	import math
	from pathlib import Path

	# ---------------------------------------------------------------------------
	# Config
	# ---------------------------------------------------------------------------
	DB_PATH = Path(__file__).parent / "bps_schools.db"
	VECTOR_DIR = Path(__file__).parent / "vector_store"
	EMBEDDING_MODEL = "all-MiniLM-L6-v2"


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	def haversine_miles(lat1, lon1, lat2, lon2):
	"""Calculate distance in miles between two lat/lon points."""
	R = 3958.8
	dlat = math.radians(lat2 - lat1)
	dlon = math.radians(lon2 - lon1)
	a = (math.sin(dlat / 2) ** 2 +
	math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
	math.sin(dlon / 2) ** 2)
	return R * 2 * math.asin(math.sqrt(a))


	# ---------------------------------------------------------------------------
	# Query Interface
	# ---------------------------------------------------------------------------

	class BPSDatabase:
	"""Query interface for the chatbot."""

	def __init__(self, db_path=None, vector_dir=None):
	self.db_path = db_path or DB_PATH
	self.vector_dir = vector_dir or VECTOR_DIR
	self.conn = sqlite3.connect(str(self.db_path))
	self.conn.row_factory = sqlite3.Row

	self.index = None
	self.documents = None
	self.metadata = None
	self.model = None
	self._load_vector_store()

	def _load_vector_store(self):
	"""Load FAISS index and metadata."""
	try:
	meta_path = Path(self.vector_dir) / "metadata.json"
	docs_path = Path(self.vector_dir) / "documents.json"
	index_path = Path(self.vector_dir) / "school_index.faiss"

	if meta_path.exists():
	with open(meta_path) as f:
	self.metadata = json.load(f)
	with open(docs_path) as f:
	self.documents = json.load(f)

	if index_path.exists():
	import faiss
	self.index = faiss.read_index(str(index_path))
	from sentence_transformers import SentenceTransformer
	self.model = SentenceTransformer(EMBEDDING_MODEL)
	print(f"Vector store loaded: {self.index.ntotal} schools indexed")
	except Exception as e:
	print(f"Vector store not fully loaded: {e}")

	# --- Hard Filtering (SQL) ---

	def find_schools_by_grade(self, grade: int) -> list:
	"""Find BPS schools that serve a specific grade level.
	Grade encoding: K0=-2, K1=-1, K2=0, 1-12 as integers."""
	cur = self.conn.execute(
	"SELECT * FROM schools WHERE grade_min <= ? AND grade_max >= ? ORDER BY school",
	(grade, grade),
	)
	return [dict(r) for r in cur.fetchall()]

	def find_schools_by_age(self, age_months: int) -> list:
	"""Find non-BPS schools that serve a specific age in months."""
	cur = self.conn.execute(
	"SELECT * FROM schools WHERE age_min_months <= ? AND age_max_months >= ? ORDER BY school",
	(age_months, age_months),
	)
	return [dict(r) for r in cur.fetchall()]

	def find_schools_near(self, lat: float, lon: float, radius_miles: float = 1.0) -> list:
	"""Find schools within radius_miles of a given lat/lon."""
	lat_delta = radius_miles / 69.0
	lon_delta = radius_miles / 53.0

	cur = self.conn.execute(
	"""SELECT * FROM schools
	WHERE latitude BETWEEN ? AND ?
	AND longitude BETWEEN ? AND ?""",
	(lat - lat_delta, lat + lat_delta, lon - lon_delta, lon + lon_delta),
	)

	results = []
	for row in cur.fetchall():
	d = dict(row)
	if d["latitude"] and d["longitude"]:
	dist = haversine_miles(lat, lon, d["latitude"], d["longitude"])
	if dist <= radius_miles:
	d["distance_miles"] = round(dist, 2)
	results.append(d)

	results.sort(key=lambda x: x["distance_miles"])
	return results

	def find_schools_by_provider_type(self, provider_type: str) -> list:
	"""Find schools by provider type (exact match)."""
	cur = self.conn.execute(
	"SELECT * FROM schools WHERE provider_type = ? ORDER BY school",
	(provider_type,),
	)
	return [dict(r) for r in cur.fetchall()]

	def find_schools_by_filters(self, **kwargs) -> list:
	"""
	Combined AND filter for boolean/integer fields.
	Supported kwargs: UPK, ADA, accepts_ccfa, headstart, has_language_program,
	has_advanced_placement, has_international_baccalaureate, uniform,
	special_admission, surround_care, build_care, tuition.
	"""
	allowed = {
	"UPK", "ADA", "accepts_ccfa", "headstart", "has_language_program",
	"has_advanced_placement", "has_international_baccalaureate", "uniform",
	"special_admission", "surround_care", "build_care", "tuition",
	}

	query = "SELECT * FROM schools WHERE 1=1"
	params = []

	for key, val in kwargs.items():
	if key in allowed and val is not None:
	query += f" AND {key} = ?"
	params.append(val)

	cur = self.conn.execute(query + " ORDER BY school", params)
	return [dict(r) for r in cur.fetchall()]

	def hard_filter(self, grade: int = None, age_months: int = None,
	provider_type: str = None, lat: float = None, lon: float = None,
	radius_miles: float = 1.0, **boolean_filters) -> list:
	"""
	Combined hard filter. All specified conditions must match (AND logic).
	"""
	query = "SELECT * FROM schools WHERE 1=1"
	params = []

	if grade is not None:
	query += " AND grade_min <= ? AND grade_max >= ?"
	params.extend([grade, grade])

	if age_months is not None:
	query += " AND age_min_months <= ? AND age_max_months >= ?"
	params.extend([age_months, age_months])

	if provider_type:
	query += " AND provider_type = ?"
	params.append(provider_type)

	allowed_bools = {
	"UPK", "ADA", "accepts_ccfa", "headstart", "has_language_program",
	"has_advanced_placement", "has_international_baccalaureate", "uniform",
	"special_admission", "surround_care", "build_care", "tuition",
	}
	for key, val in boolean_filters.items():
	if key in allowed_bools and val is not None:
	query += f" AND {key} = ?"
	params.append(val)

	cur = self.conn.execute(query + " ORDER BY school", params)
	results = [dict(r) for r in cur.fetchall()]

	# Post-filter by distance if location provided
	if lat is not None and lon is not None:
	filtered = []
	for r in results:
	if r["latitude"] and r["longitude"]:
	dist = haversine_miles(lat, lon, r["latitude"], r["longitude"])
	if dist <= radius_miles:
	r["distance_miles"] = round(dist, 2)
	filtered.append(r)
	results = sorted(filtered, key=lambda x: x["distance_miles"])

	return results

	# --- Soft Filtering (Vector Search / RAG) ---

	def semantic_search(self, query: str, top_k: int = 10,
	pre_filter_ids: set = None) -> list:
	"""
	Semantic search over BPS school descriptions.

	Args:
	query: Natural language query
	top_k: Number of results to return
	pre_filter_ids: If provided, only search within these school IDs
	"""
	if self.index is None or self.model is None:
	return self._keyword_search(query, top_k, pre_filter_ids)

	import numpy as np

	query_vec = self.model.encode([query], normalize_embeddings=True)
	query_vec = np.array(query_vec).astype("float32")

	search_k = min(top_k * 5, self.index.ntotal) if pre_filter_ids else top_k
	scores, indices = self.index.search(query_vec, search_k)

	results = []
	for score, idx in zip(scores[0], indices[0]):
	if idx < 0:
	continue
	meta = self.metadata[idx]

	if pre_filter_ids and meta["id"] not in pre_filter_ids:
	continue

	results.append({
	"id": meta["id"],
	"school": meta["school"],
	"score": float(score),
	"description": self.documents[idx],
	"metadata": meta,
	})

	if len(results) >= top_k:
	break

	return results

	def _keyword_search(self, query: str, top_k: int = 10,
	pre_filter_ids: set = None) -> list:
	"""Fallback keyword search when FAISS/embeddings not available."""
	if not self.documents:
	return []

	query_words = set(query.lower().split())
	scored = []

	for i, (doc, meta) in enumerate(zip(self.documents, self.metadata)):
	if pre_filter_ids and meta["id"] not in pre_filter_ids:
	continue

	doc_lower = doc.lower()
	score = sum(1 for w in query_words if w in doc_lower)
	if score > 0:
	scored.append({
	"id": meta["id"],
	"school": meta["school"],
	"score": score / len(query_words),
	"description": doc,
	"metadata": meta,
	})

	scored.sort(key=lambda x: x["score"], reverse=True)
	return scored[:top_k]

	# --- Combined Hard + Soft Filter ---

	def search(self, query: str = None, grade: int = None, provider_type: str = None,
	lat: float = None, lon: float = None, radius_miles: float = 1.0,
	top_k: int = 10, **filters) -> list:
	"""
	Full search pipeline:
	1. Apply hard filters (grade, provider_type, location, booleans)
	2. Within hard-filtered results, rank by semantic similarity to query
	"""
	hard_results = self.hard_filter(
	grade=grade, provider_type=provider_type,
	lat=lat, lon=lon, radius_miles=radius_miles, **filters,
	)

	if not query or not query.strip():
	return hard_results[:top_k]

	eligible_ids = set(r["id"] for r in hard_results)

	if not eligible_ids:
	return self.semantic_search(query, top_k)

	soft_results = self.semantic_search(query, top_k, eligible_ids)

	dist_map = {r["id"]: r.get("distance_miles") for r in hard_results}
	for r in soft_results:
	if dist_map.get(r["id"]) is not None:
	r["distance_miles"] = dist_map[r["id"]]

	return soft_results

	# --- Filter Based on Preferences ---

	def filter_based_on_preferences(self, school_ids: list, **preferences) -> dict:
	"""
	Filter eligible school IDs by preferences. Returns results split by
	BPS vs non-BPS, with BPS schools semantically ranked when a query is provided.

	Args:
	school_ids: List of school IDs (from find_eligible_schools). Required, cannot be empty.
	**preferences: Optional filters (query, top_k, ADA, UPK, has_language_program, etc.)

	Returns:
	Dict with bps_schools, non_bps_schools, bps_count, non_bps_count, notes.
	"""
	if not school_ids:
	return {
	"error": "school_ids is required and cannot be empty. Call find_eligible_schools first to get eligible school IDs.",
	"bps_schools": [],
	"non_bps_schools": [],
	"notes": [],
	}

	query = preferences.pop("query", None)
	top_k = preferences.pop("top_k", 10)
	lat = preferences.pop("lat", None)
	lon = preferences.pop("lon", None)
	radius_miles = preferences.pop("radius_miles", 1.0)
	provider_type = preferences.pop("provider_type", None)

	# Build SQL with ID constraint + optional filters
	placeholders = ",".join("?" for _ in school_ids)
	sql = f"SELECT * FROM schools WHERE id IN ({placeholders})"
	params = list(school_ids)

	if provider_type:
	sql += " AND provider_type = ?"
	params.append(provider_type)

	allowed_bools = {
	"UPK", "ADA", "accepts_ccfa", "headstart", "has_language_program",
	"has_advanced_placement", "has_international_baccalaureate", "uniform",
	"special_admission", "surround_care", "build_care", "tuition",
	}
	for key, val in preferences.items():
	if key in allowed_bools and val is not None:
	sql += f" AND {key} = ?"
	params.append(val)

	cur = self.conn.execute(sql + " ORDER BY school", params)
	rows = [dict(r) for r in cur.fetchall()]

	# Post-filter by distance
	if lat is not None and lon is not None:
	filtered = []
	for r in rows:
	if r["latitude"] and r["longitude"]:
	dist = haversine_miles(lat, lon, r["latitude"], r["longitude"])
	if dist <= radius_miles:
	r["distance_miles"] = round(dist, 2)
	filtered.append(r)
	rows = sorted(filtered, key=lambda x: x["distance_miles"])

	# Split BPS vs non-BPS
	bps_rows = [r for r in rows if r["provider_type"] == "Boston Public School"]
	non_bps_rows = [r for r in rows if r["provider_type"] != "Boston Public School"]

	bps_count = len(bps_rows)
	non_bps_count = len(non_bps_rows)
	notes = []

	# Semantic ranking for BPS if query provided
	if query and bps_rows:
	bps_ids = set(str(r["id"]) for r in bps_rows)
	ranked = self.semantic_search(query, top_k=len(bps_rows), pre_filter_ids=bps_ids)

	ranked_ids = [r["id"] for r in ranked]
	score_map = {r["id"]: r["score"] for r in ranked}

	# Build ranked BPS list: ranked first, then unranked
	bps_by_id = {str(r["id"]): r for r in bps_rows}
	ranked_bps = []
	for rid in ranked_ids:
	if str(rid) in bps_by_id:
	row = bps_by_id.pop(str(rid))
	row["score"] = score_map[rid]
	ranked_bps.append(row)
	# Append any BPS schools not in FAISS results
	for row in sorted(bps_by_id.values(), key=lambda x: x["school"]):
	ranked_bps.append(row)

	bps_rows = ranked_bps

	if query and non_bps_count > 0:
	notes.append(
	f"Non-BPS schools cannot be ranked by their semantic preference "
	f"'{query}' because detailed program descriptions are not publicly "
	f"available for these providers, tell this to the user."
	)

	if bps_count == 0 and non_bps_count == 0:
	notes.append(
	"No schools matched the given filters. Tell the user to broaden "
	"their criteria or to reach out to their eligible schools for more information."
	)

	return {
	"bps_schools": bps_rows[:top_k],
	"non_bps_schools": non_bps_rows[:top_k],
	"bps_count": bps_count,
	"non_bps_count": non_bps_count,
	"notes": notes,
	}

	# --- Utility methods ---

	def get_school_detail(self, school_id: str) -> dict:
	"""Get full school record + RAG description if available."""
	cur = self.conn.execute("SELECT * FROM schools WHERE id = ?", (school_id,))
	row = cur.fetchone()
	if row:
	result = dict(row)
	if self.metadata:
	for i, m in enumerate(self.metadata):
	if m["id"] == school_id:
	result["description"] = self.documents[i]
	break
	return result
	return None

	def get_all_provider_types(self) -> list:
	"""Get distinct provider types."""
	cur = self.conn.execute(
	"SELECT DISTINCT provider_type FROM schools ORDER BY provider_type"
	)
	return [r[0] for r in cur.fetchall()]

	def close(self):
	self.conn.close()