GitHub Actions committed on
Commit 1d32142 · 0 Parent(s)

Sync from GitHub
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY app.py .
+ COPY graph/ ./graph/
+ COPY agents/ ./agents/
+ COPY encoders/ ./encoders/
+ COPY recommender/ ./recommender/
+ COPY tools/ ./tools/
+
+ # Create non-root user for security (required by HF Spaces)
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # HF Spaces expects port 7860
+ EXPOSE 7860
+
+ # Run the FastAPI app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,49 @@
+ ---
+ title: Ollama API Proxy
+ emoji: 🦙
+ colorFrom: purple
+ colorTo: blue
+ sdk: docker
+ pinned: false
+ ---
+
+ # Ollama API Proxy
+
+ A FastAPI-based proxy for the Ollama API hosted on Hugging Face Spaces.
+
+ ## Endpoints
+
+ ### GET /
+ Health check endpoint returning service status.
+
+ ### GET /health
+ Simple health check endpoint.
+
+ ### POST /chat
+ Send a chat message to the Ollama API.
+
+ **Request Body:**
+ ```json
+ {
+   "message": "Your message here",
+   "model": "gpt-oss:120b",
+   "stream": true
+ }
+ ```
+
+ **Response (non-streaming):**
+ ```json
+ {
+   "response": "The AI response"
+ }
+ ```
+
+ ## Environment Variables
+
+ - `OLLAMA_API_KEY`: Your Ollama API key (set as a secret in HF Spaces)
+
+ ## Setup
+
+ 1. Create a new Space on Hugging Face with Docker SDK
+ 2. Add `OLLAMA_API_KEY` as a repository secret
+ 3. Push this code to the Space repository
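The `/chat` request body documented above can be built and sent from any HTTP client; a minimal Python sketch using only the standard library (the Space hostname is a placeholder, not the real deployment URL):

```python
import json
import urllib.request

# Shape of the POST /chat request body documented in the README above.
payload = {
    "message": "Tell me about the Red Cross",
    "model": "gpt-oss:120b",
    "stream": False,
}
body = json.dumps(payload).encode("utf-8")

# "YOUR-SPACE.hf.space" is a placeholder; substitute your own Space's hostname.
req = urllib.request.Request(
    "https://YOUR-SPACE.hf.space/chat",
    data=body,
    headers={"Content-Type": "application/json"},
    method="POST",
)
# resp = urllib.request.urlopen(req)  # uncomment to actually send the request
```

With `"stream": true` the endpoint would return a streaming response instead of the single JSON object shown above.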
agents/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """LangGraph agent nodes."""
+ from .base import BaseMemoryAgent
+ from .classifier import create_classifier, classify_message
+ from .therapist import TherapistAgent
+ from .logical import LogicalAgent
+ from .charity_search import CharitySearchAgent, create_charity_search_agent
+ from .agentic_rag import AgenticRAGAgent, create_agentic_rag_agent
+
+ __all__ = [
+     "BaseMemoryAgent",
+     "create_classifier",
+     "classify_message",
+     "TherapistAgent",
+     "LogicalAgent",
+     "CharitySearchAgent",
+     "create_charity_search_agent",
+     "AgenticRAGAgent",
+     "create_agentic_rag_agent",
+ ]
agents/agentic_rag.py ADDED
@@ -0,0 +1,231 @@
+ """Agentic RAG Agent for autonomous vector store exploration.
+
+ This agent uses a ReAct loop to iteratively explore the vector database,
+ making autonomous decisions about how to search, filter, and refine results.
+ """
+
+ import uuid
+ from datetime import datetime
+ from typing import Optional, Dict, Any, List
+
+ from langgraph.prebuilt import create_react_agent
+ from langgraph.store.base import BaseStore
+ from langchain_core.runnables import RunnableConfig
+ from langchain_core.messages import HumanMessage, SystemMessage
+
+ from tools.rag_tools import RAG_TOOLS, set_rag_dependencies
+
+
+ AGENTIC_RAG_SYSTEM_PROMPT = """You are an intelligent research agent with access to a vector database containing donor and volunteer profiles.
+
+ Your goal is to help users find the most relevant matches by autonomously exploring the database.
+
+ ## Available Tools
+
+ 1. **list_available_categories** - Start here to understand what data exists (countries, causes, types)
+ 2. **get_statistics** - Get database size and composition
+ 3. **semantic_search** - Find profiles by natural language query
+ 4. **filter_by_metadata** - Browse by specific field values (country, form_type, etc.)
+ 5. **hybrid_search** - Combine semantic search with filters for precise results
+ 6. **get_document_by_id** - Get full details of a specific profile
+
+ ## Search Strategy
+
+ Follow this iterative exploration process:
+
+ 1. **Understand the Request**: Parse what the user is looking for
+ 2. **Explore Categories**: Use list_available_categories to see what's available
+ 3. **Initial Search**: Start with semantic_search or hybrid_search
+ 4. **Evaluate Results**: Check if results match user needs
+ 5. **Refine if Needed**: Try different queries or filters if initial results aren't ideal
+ 6. **Deep Dive**: Use get_document_by_id for promising candidates
+
+ ## Best Practices
+
+ - Always explore categories first to understand the data structure
+ - Combine semantic understanding with metadata filters for best results
+ - If results seem off, try rephrasing the query or adjusting filters
+ - Look at multiple candidates before making recommendations
+ - Provide clear reasoning about why you selected certain results
+
+ ## Example Exploration
+
+ User: "Find donors interested in education in Singapore"
+
+ Your approach:
+ 1. Call list_available_categories() to confirm "education" is a valid cause and "SG" is a country
+ 2. Call hybrid_search(query="education donors", country="SG", form_type="donor")
+ 3. Review results - if they're corporate donors but user wants individuals, refine
+ 4. Call hybrid_search(query="individual education supporters", country="SG", form_type="donor")
+ 5. Call get_document_by_id() on top matches for full details
+ 6. Present findings with explanation
+
+ Always explain your search process and reasoning to the user."""
+
+
+ class AgenticRAGAgent:
+     """Agent that autonomously explores a vector database using RAG tools.
+
+     Uses LangGraph's ReAct pattern to iteratively search, filter, and
+     retrieve documents based on user queries.
+
+     Attributes:
+         llm: The language model for reasoning
+         tools: List of RAG tools for vector store exploration
+         react_agent: The compiled ReAct agent
+         encoder: The embedding encoder
+         vector_store: The vector store instance
+     """
+
+     def __init__(self, llm, encoder=None, vector_store=None):
+         """Initialize the Agentic RAG Agent.
+
+         Args:
+             llm: Language model for reasoning and tool use
+             encoder: SeaLion encoder for query embedding (can be set later)
+             vector_store: DonorVectorStore instance (can be set later)
+         """
+         self.llm = llm
+         self.tools = RAG_TOOLS
+         self.encoder = encoder
+         self.vector_store = vector_store
+
+         # Initialize dependencies if provided
+         if encoder and vector_store:
+             self.set_dependencies(encoder, vector_store)
+
+         # Create the ReAct agent
+         self.react_agent = create_react_agent(
+             model=llm,
+             tools=self.tools,
+         )
+
+     def set_dependencies(self, encoder, vector_store):
+         """Set encoder and vector store after initialization.
+
+         Args:
+             encoder: The SeaLion encoder instance
+             vector_store: The DonorVectorStore instance
+         """
+         self.encoder = encoder
+         self.vector_store = vector_store
+         set_rag_dependencies(encoder, vector_store)
+
+     async def retrieve_memories(self, store: BaseStore, user_id: str, query: str) -> str:
+         """Fetch relevant memories for this user."""
+         namespace = ("memories", user_id)
+         memories = await store.asearch(namespace, query=query)
+         return "\n".join([d.value.get("data", "") for d in memories])
+
+     async def store_message(self, store: BaseStore, user_id: str, content: str, role: str):
+         """Store message to memory store."""
+         memory_id = str(uuid.uuid4())
+         namespace = ("memories", user_id)
+         await store.aput(namespace, memory_id, {
+             "data": content,
+             "role": role,
+             "timestamp": datetime.now().isoformat()
+         })
+
+     async def search(self, query: str, config: Optional[RunnableConfig] = None) -> Dict[str, Any]:
+         """Execute a standalone RAG search without state management.
+
+         Args:
+             query: The user's search query
+             config: Optional runnable config
+
+         Returns:
+             Dictionary with 'response' and 'tool_calls' keys
+         """
+         messages = [
+             SystemMessage(content=AGENTIC_RAG_SYSTEM_PROMPT),
+             HumanMessage(content=query)
+         ]
+
+         result = await self.react_agent.ainvoke(
+             {"messages": messages},
+             config=config
+         )
+
+         # Extract response and tool call history
+         final_message = result["messages"][-1]
+
+         # Collect tool calls from message history
+         tool_calls = []
+         for msg in result["messages"]:
+             if hasattr(msg, 'tool_calls') and msg.tool_calls:
+                 for tc in msg.tool_calls:
+                     tool_calls.append({
+                         "tool": tc.get("name", "unknown"),
+                         "args": tc.get("args", {})
+                     })
+
+         return {
+             "response": final_message.content,
+             "tool_calls": tool_calls,
+             "message_count": len(result["messages"])
+         }
+
+     async def __call__(
+         self,
+         state: dict,
+         config: RunnableConfig,
+         *,
+         store: BaseStore
+     ) -> dict:
+         """Execute the agentic RAG agent as a LangGraph node.
+
+         This allows the agent to be used within a larger LangGraph workflow.
+
+         Args:
+             state: Current graph state with messages
+             config: Runnable configuration with user_id etc.
+             store: LangGraph store for memory persistence
+
+         Returns:
+             Updated state with agent response
+         """
+         last_message = state["messages"][-1]
+         user_id = config["configurable"].get("user_id", "default_user")
+
+         # Get memories for context
+         memory_info = await self.retrieve_memories(store, user_id, str(last_message.content))
+
+         # Build messages with system prompt and memory context
+         system_content = AGENTIC_RAG_SYSTEM_PROMPT
+         if memory_info:
+             system_content += f"\n\n## Previous Conversation Context\n{memory_info}"
+
+         messages = [
+             SystemMessage(content=system_content),
+             HumanMessage(content=last_message.content)
+         ]
+
+         # Store user message
+         await self.store_message(store, user_id, last_message.content, "user")
+
+         # Run the ReAct agent with tools
+         result = await self.react_agent.ainvoke({"messages": messages})
+
+         # Extract the final response
+         final_message = result["messages"][-1]
+         response_content = final_message.content
+
+         # Store assistant response
+         await self.store_message(store, user_id, response_content, "assistant")
+
+         return {"messages": [{"role": "assistant", "content": response_content}]}
+
+
+ def create_agentic_rag_agent(llm, encoder=None, vector_store=None):
+     """Factory function to create an AgenticRAGAgent instance.
+
+     Args:
+         llm: Language model for reasoning
+         encoder: Optional encoder for query embedding
+         vector_store: Optional vector store instance
+
+     Returns:
+         Configured AgenticRAGAgent instance
+     """
+     return AgenticRAGAgent(llm, encoder, vector_store)
agents/base.py ADDED
@@ -0,0 +1,78 @@
+ import uuid
+ from datetime import datetime
+ from abc import ABC, abstractmethod
+ from langgraph.store.base import BaseStore
+ from langchain_core.runnables import RunnableConfig
+
+
+ class BaseMemoryAgent(ABC):
+     """Base class for agents with memory capabilities.
+
+     Extracts shared logic from therapist_agent and logical_agent:
+     - Memory retrieval from store
+     - Automatic storage of all conversations (user + assistant messages)
+     - Message construction with system prompt + memories
+     - LLM invocation and response formatting
+     """
+
+     def __init__(self, llm):
+         self.llm = llm
+
+     @property
+     @abstractmethod
+     def system_prompt(self) -> str:
+         """Each agent defines its own personality/system prompt."""
+         pass
+
+     async def retrieve_memories(self, store: BaseStore, user_id: str, query: str) -> str:
+         """Fetch relevant memories for this user."""
+         namespace = ("memories", user_id)
+         memories = await store.asearch(namespace, query=query)
+         return "\n".join([d.value.get("data", "") for d in memories])
+
+     async def store_message(self, store: BaseStore, user_id: str, content: str, role: str):
+         """Store every message to Supabase automatically.
+
+         Args:
+             store: The LangGraph store instance
+             user_id: User identifier for namespacing
+             content: The message content
+             role: Either 'user' or 'assistant'
+         """
+         memory_id = str(uuid.uuid4())
+         namespace = ("memories", user_id)
+         await store.aput(namespace, memory_id, {
+             "data": content,
+             "role": role,
+             "timestamp": datetime.now().isoformat()
+         })
+
+     async def __call__(self, state: dict, config: RunnableConfig, *, store: BaseStore) -> dict:
+         """Make the agent callable for LangGraph node compatibility."""
+         last_message = state["messages"][-1]
+         user_id = config["configurable"].get("user_id", "default_user")
+
+         # Get memories
+         memory_info = await self.retrieve_memories(store, user_id, str(last_message.content))
+
+         # Build prompt with memories injected
+         full_prompt = f"""{self.system_prompt}
+
+ User information from previous sessions:
+ {memory_info}"""
+
+         messages = [
+             {"role": "system", "content": full_prompt},
+             {"role": "user", "content": last_message.content}
+         ]
+
+         # Store user message automatically
+         await self.store_message(store, user_id, last_message.content, "user")
+
+         # Get response from LLM
+         reply = self.llm.invoke(messages)
+
+         # Store assistant response automatically
+         await self.store_message(store, user_id, reply.content, "assistant")
+
+         return {"messages": [{"role": "assistant", "content": reply.content}]}
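Every record that `store_message()` writes into the store has the same three-field shape keyed by a fresh UUID; a minimal sketch of just that payload construction (the store call itself is omitted):

```python
import uuid
from datetime import datetime


def build_memory_record(content: str, role: str) -> tuple:
    """Build the (key, value) pair that BaseMemoryAgent.store_message()
    persists: a fresh UUID key and a payload with content, role, timestamp."""
    memory_id = str(uuid.uuid4())
    record = {
        "data": content,
        "role": role,
        "timestamp": datetime.now().isoformat(),
    }
    return memory_id, record


memory_id, record = build_memory_record("I want to support education causes", "user")
```

Because `retrieve_memories()` later reads only the `"data"` field, the `"role"` and `"timestamp"` fields exist purely for provenance in the backing store.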
agents/charity_search.py ADDED
@@ -0,0 +1,111 @@
+ """Charity Search Agent with web search capabilities."""
+
+ import uuid
+ from datetime import datetime
+ from langgraph.store.base import BaseStore
+ from langgraph.prebuilt import create_react_agent
+ from langchain_core.runnables import RunnableConfig
+ from langchain_core.messages import HumanMessage, SystemMessage
+
+ from tools.web_search import CHARITY_SEARCH_TOOLS, clear_search_cache
+
+
+ CHARITY_SEARCH_SYSTEM_PROMPT = """You are a helpful charity research assistant specialized in finding information about nonprofit organizations and charities.
+
+ Your role is to help users:
+ 1. Find detailed information about specific charity organizations
+ 2. Research charity ratings and accountability
+ 3. Discover charities working in specific cause areas
+ 4. Verify legitimacy and financial transparency of organizations
+
+ IMPORTANT - SEARCH STRATEGY:
+ - ALWAYS use the search_charity_comprehensive tool FIRST - it performs a SINGLE optimized search that retrieves mission, programs, ratings, and financial info all at once
+ - Only use search_charity_info or search_charity_ratings if you need ADDITIONAL specific details not covered by the comprehensive search
+ - AVOID making multiple searches for the same charity - the comprehensive search already covers most needs
+
+ When presenting information:
+ - Provide clear, factual information from your search results
+ - Include source attribution when possible
+ - Give a balanced perspective on the organization
+ - Suggest further research if needed
+
+ If you cannot find reliable information, say so clearly and suggest alternative approaches."""
+
+
+ class CharitySearchAgent:
+     """Agent that searches the web for charity organization information.
+
+     Uses LangGraph's ReAct pattern with web search tools to find
+     and analyze information about nonprofit organizations.
+
+     Optimized to minimize redundant web searches by:
+     1. Using a comprehensive search tool that combines multiple queries
+     2. Caching search results to avoid duplicate API calls
+     3. Guiding the LLM to prefer single comprehensive searches
+     """
+
+     def __init__(self, llm):
+         self.llm = llm
+         self.tools = CHARITY_SEARCH_TOOLS
+         self.react_agent = create_react_agent(
+             model=llm,
+             tools=self.tools,
+         )
+
+     async def retrieve_memories(self, store: BaseStore, user_id: str, query: str) -> str:
+         """Fetch relevant memories for this user."""
+         namespace = ("memories", user_id)
+         memories = await store.asearch(namespace, query=query)
+         return "\n".join([d.value.get("data", "") for d in memories])
+
+     async def store_message(self, store: BaseStore, user_id: str, content: str, role: str):
+         """Store message to memory store."""
+         memory_id = str(uuid.uuid4())
+         namespace = ("memories", user_id)
+         await store.aput(namespace, memory_id, {
+             "data": content,
+             "role": role,
+             "timestamp": datetime.now().isoformat()
+         })
+
+     async def __call__(self, state: dict, config: RunnableConfig, *, store: BaseStore) -> dict:
+         """Execute the charity search agent as a LangGraph node."""
+         last_message = state["messages"][-1]
+         user_id = config["configurable"].get("user_id", "default_user")
+
+         # Clear search cache at the start of each new query to avoid stale results
+         # but allow caching within the same query execution
+         clear_search_cache()
+
+         # Get memories for context
+         memory_info = await self.retrieve_memories(store, user_id, str(last_message.content))
+
+         # Build messages with system prompt and memory context
+         system_content = CHARITY_SEARCH_SYSTEM_PROMPT
+         if memory_info:
+             system_content += f"\n\nPrevious conversation context:\n{memory_info}"
+
+         messages = [
+             SystemMessage(content=system_content),
+             HumanMessage(content=last_message.content)
+         ]
+
+         # Store user message
+         await self.store_message(store, user_id, last_message.content, "user")
+
+         # Run the ReAct agent with tools
+         result = await self.react_agent.ainvoke({"messages": messages})
+
+         # Extract the final response
+         final_message = result["messages"][-1]
+         response_content = final_message.content
+
+         # Store assistant response
+         await self.store_message(store, user_id, response_content, "assistant")
+
+         return {"messages": [{"role": "assistant", "content": response_content}]}
+
+
+ def create_charity_search_agent(llm):
+     """Factory function to create a CharitySearchAgent instance."""
+     return CharitySearchAgent(llm)
agents/classifier.py ADDED
@@ -0,0 +1,71 @@
+ from typing import Literal
+ from pydantic import BaseModel, Field
+ from langgraph.store.base import BaseStore
+ from langchain_core.runnables import RunnableConfig
+
+
+ class MessageClassifier(BaseModel):
+     """Classification result for routing messages."""
+     message_type: Literal["emotional", "logical", "charity_search", "donor_search", "volunteer_search"] = Field(
+         ...,
+         description="Classify message for routing to appropriate agent."
+     )
+
+
+ async def classify_message(state: dict, config: RunnableConfig, *, store: BaseStore, llm) -> dict:
+     """Classify user message to route to appropriate agent.
+
+     Args:
+         state: Graph state containing messages
+         config: Runtime config with user_id, thread_id
+         store: Memory store (required by graph but not used here)
+         llm: Language model instance
+
+     Returns:
+         Dict with message_type for routing
+     """
+     last_message = state["messages"][-1]
+     classifier_llm = llm.with_structured_output(MessageClassifier)
+
+     result = classifier_llm.invoke([
+         {
+             "role": "system",
+             "content": """Classify the user message into one of these categories:
+
+ Respond ONLY with valid JSON in this exact format:
+ {"message_type": "TYPE"}
+
+ Where TYPE is one of:
+ - 'emotional': Message requires emotional support, therapy, deals with feelings, or personal problems
+ - 'donor_search': Looking for donors in the database, finding people who donate, matching donors by criteria
+ - 'volunteer_search': Looking for volunteers in the database, finding people who volunteer, matching volunteers
+ - 'charity_search': Asking about charity organizations, nonprofits, wanting to research specific charities
+ - 'logical': Facts, information, logical analysis, practical solutions (default for general queries)
+
+ Examples:
+ - "Find donors interested in education in Singapore" → donor_search
+ - "Show me volunteers with tech skills" → volunteer_search
+ - "Tell me about Red Cross charity" → charity_search
+ - "I'm feeling sad today" → emotional
+ - "What is the capital of France?" → logical"""
+         },
+         {
+             "role": "user",
+             "content": last_message.content
+         }
+     ])
+     return {"message_type": result.message_type}
+
+
+ def create_classifier(llm):
+     """Factory to create classifier function with LLM bound.
+
+     Usage:
+         llm = ChatOllama(model="gpt-oss:120b-cloud")
+         classify = create_classifier(llm)
+         graph_builder.add_node("classifier", classify)
+     """
+     async def classifier_node(state: dict, config: RunnableConfig, *, store: BaseStore):
+         return await classify_message(state, config, store=store, llm=llm)
+
+     return classifier_node
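The classifier node returns only `{"message_type": ...}`; the graph then uses that value to pick the next node. The actual wiring lives in `graph/` (not shown in this commit), so the node names below are hypothetical, but the routing pattern can be sketched as a plain lookup with a `logical` fallback, matching the classifier's stated default:

```python
# Hypothetical node names -- the real edge wiring lives in graph/, not here.
ROUTES = {
    "emotional": "therapist",
    "logical": "logical",
    "charity_search": "charity_search",
    "donor_search": "agentic_rag",
    "volunteer_search": "agentic_rag",
}


def route_message(classifier_output: dict) -> str:
    """Pick the next node from the classifier's {"message_type": ...} result,
    falling back to the logical agent for missing or unknown types."""
    return ROUTES.get(classifier_output.get("message_type"), "logical")
```

In LangGraph this function would typically be passed to `add_conditional_edges` on the classifier node.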
agents/logical.py ADDED
@@ -0,0 +1,29 @@
+ from .base import BaseMemoryAgent
+
+
+ class LogicalAgent(BaseMemoryAgent):
+     """Logical/factual response agent."""
+
+     @property
+     def system_prompt(self) -> str:
+         return """You are a helpful assistant for a charity matching platform. Focus on providing clear, factual information about donors, charities, and philanthropy.
+
+ **Your role:**
+ - Provide accurate information about donor matching and charity recommendations
+ - Answer questions about causes, giving strategies, and impact
+ - Help users understand data and insights from the platform
+
+ **Response formatting guidelines:**
+ - Use **bold** for important terms or key points
+ - Use bullet points (- ) for listing features, options, or facts
+ - Use numbered lists (1. 2. 3.) for sequences or ranked items
+ - Keep paragraphs short (2-3 sentences max)
+ - Add blank lines between sections for readability
+ - Use headers with **Bold Text** when covering multiple topics
+
+ **Structure your responses:**
+ 1. Start with a direct answer to the question
+ 2. Provide supporting details or context
+ 3. End with actionable next steps if applicable
+
+ **Keep responses focused and concise - aim for 3-5 short paragraphs maximum.**"""
agents/therapist.py ADDED
@@ -0,0 +1,24 @@
+ from .base import BaseMemoryAgent
+
+
+ class TherapistAgent(BaseMemoryAgent):
+     """Emotional/therapeutic response agent."""
+
+     @property
+     def system_prompt(self) -> str:
+         return """You are a compassionate therapist assistant for a charity matching platform. Focus on the emotional aspects of the user's message.
+
+ **Your approach:**
+ - Show empathy and validate their feelings
+ - Help them process their emotions about giving and philanthropy
+ - Ask thoughtful questions to explore their motivations for helping others
+
+ **Response formatting guidelines:**
+ - Use **bold** for emphasis on key points
+ - Use bullet points (- ) for lists of suggestions or ideas
+ - Use numbered lists (1. 2. 3.) for step-by-step guidance
+ - Keep paragraphs short and readable (2-3 sentences max)
+ - Add a blank line between sections for clarity
+ - End with an encouraging note or thoughtful question
+
+ **Keep responses concise but warm - aim for 3-5 short paragraphs maximum.**"""
app.py ADDED
@@ -0,0 +1,1817 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI endpoints for Ollama chat and donor/volunteer recommendation system.
3
+
4
+ Endpoints:
5
+ - /chat: Chat with Ollama model using LangGraph with memory
6
+ - /donors/register: Register a donor and generate embedding
7
+ - /volunteers/register: Register a volunteer and generate embedding
8
+ - /donors/recommend: Find similar donors based on query
9
+ - /volunteers/recommend: Find similar volunteers based on query
10
+ - /forms/{id}: Get/Delete a stored form
11
+ - /forms/stats: Get form counts by type
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import asyncio
17
+ from contextlib import asynccontextmanager
18
+ from typing import Optional, List, Dict, Any
19
+
20
+ # Add app directory to path for local module imports
21
+ APP_DIR = os.path.dirname(os.path.abspath(__file__))
22
+ if APP_DIR not in sys.path:
23
+ sys.path.insert(0, APP_DIR)
24
+
25
+ from fastapi import FastAPI, HTTPException
26
+ from fastapi.responses import StreamingResponse
27
+ from fastapi.middleware.cors import CORSMiddleware
28
+ from pydantic import BaseModel, Field
29
+
30
+ # Windows-specific fix for psycopg async compatibility
31
+ if sys.platform == "win32":
32
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
33
+
34
+ # Load .env file for local development
35
+ try:
36
+ from dotenv import load_dotenv
37
+ load_dotenv()
38
+ except ImportError:
39
+ pass
40
+
41
+ # Lazy imports for encoder/recommender (avoid import errors if deps missing)
42
+ encoder = None
43
+ vector_store = None
44
+ pool = None
45
+ gis_recommender = None
46
+
47
+
48
+ # ============================================================================
49
+ # Pydantic Models
50
+ # ============================================================================
51
+
52
+ class ChatResponse(BaseModel):
+     response: str
+
+
+ class DonorFormRequest(BaseModel):
+     """Donor registration form."""
+     id: str = Field(..., description="Unique identifier for the donor")
+     name: str = Field(..., description="Donor name")
+     donor_type: str = Field(..., description="Type: individual, corporate, foundation")
+     country: str = Field(..., description="ASEAN country code (SG, MY, TH, VN, ID, PH, etc.)")
+     preferred_language: str = Field(..., description="Primary language code")
+     causes: List[str] = Field(default_factory=list, description="Interested causes")
+     donation_frequency: Optional[str] = Field(None, description="one-time, monthly, quarterly, annual")
+     amount_range: Optional[str] = Field(None, description="Preferred donation range")
+     bio: Optional[str] = Field(None, description="Donor background")
+     motivation: Optional[str] = Field(None, description="Why they want to donate")
+
+
+ class VolunteerFormRequest(BaseModel):
+     """Volunteer registration form."""
+     id: str = Field(..., description="Unique identifier for the volunteer")
+     name: str = Field(..., description="Volunteer name")
+     volunteer_type: str = Field(..., description="Type: regular, event_based, skilled")
+     country: str = Field(..., description="ASEAN country code")
+     preferred_language: str = Field(..., description="Primary language code")
+     languages_spoken: List[str] = Field(default_factory=list, description="All languages spoken")
+     skills: List[str] = Field(default_factory=list, description="Professional/technical skills")
+     availability: str = Field(..., description="weekends, evenings, flexible, full_time")
+     causes: List[str] = Field(default_factory=list, description="Interested causes")
+     experience: Optional[str] = Field(None, description="Prior volunteer experience")
+     goals: Optional[str] = Field(None, description="What they hope to achieve")
+
+
+ class RecommendRequest(BaseModel):
+     """Request for recommendations based on a query form."""
+     # Either provide a form_id to use existing embedding, or provide form data
+     form_id: Optional[str] = Field(None, description="Existing form ID to use as query")
+     # Or provide inline form data
+     country: Optional[str] = None
+     preferred_language: Optional[str] = None
+     causes: List[str] = Field(default_factory=list)
+     bio: Optional[str] = None
+     motivation: Optional[str] = None
+     # Search options
+     limit: int = Field(default=10, ge=1, le=50)
+     country_filter: Optional[str] = None
+     exclude_ids: List[str] = Field(default_factory=list)
+
+
+ class FormResponse(BaseModel):
+     """Response for form operations."""
+     id: str
+     form_type: str
+     message: str
+     embedding_dimension: Optional[int] = None
+
+
+ class ClientProfileRequest(BaseModel):
+     """Client profile with spatial and behavioral data."""
+
+     user_id: str
+     coordinates: List[float] = Field(
+         default=[1.3521, 103.8198], description="[lat, lng]"
+     )
+     planning_area: str = Field(default="central", description="Singapore planning area")
+     housing_type: str = Field(
+         default="hdb_4_room", description="Housing type for income proxy"
+     )
+     interests: List[str] = Field(default_factory=list)
+     causes: List[str] = Field(default_factory=list)
+     preferred_language: str = Field(default="en")
+     is_donor: bool = False
+     total_donated: float = 0.0
+     donation_count: int = 0
+     age_range: Optional[str] = None
+
+
+ class LookalikeRequest(BaseModel):
+     """Request for lookalike client search."""
+
+     seed_causes: List[str] = Field(..., description="Causes to find lookalikes for")
+     seed_interests: List[str] = Field(default_factory=list)
+     planning_area_filter: Optional[str] = Field(
+         None, description="Geo-fence by planning area"
+     )
+     housing_type_filter: Optional[List[str]] = Field(
+         None, description="Filter by housing types"
+     )
+     limit: int = Field(default=50, ge=1, le=200)
+     min_score: float = Field(default=0.0, ge=0.0, le=1.0)
+     include_geojson: bool = Field(
+         default=True, description="Include GeoJSON for mapping"
+     )
+
+
+ class ScoredClientResponse(BaseModel):
+     """Single scored client result."""
+
+     user_id: str
+     planning_area: str
+     housing_type: str
+     causes: List[str]
+     interests: List[str]
+     is_donor: bool
+     final_score: float
+     vector_similarity: float
+     spatial_proxy: float
+     proximity: float
+     coordinates: Optional[List[float]] = None  # Reduced precision
+
+
+ class LookalikeResponse(BaseModel):
+     """Response containing lookalike clients with optional GeoJSON."""
+
+     seed_causes: List[str]
+     total_found: int
+     tiers: Dict[str, List[ScoredClientResponse]]
+     geojson: Optional[Dict[str, Any]] = None
+
+
+ class SingpassMockData(BaseModel):
+     """Mock Singpass data for autofill."""
+
+     name: str
+     nric_masked: str
+     email: str
+     mobile: str
+     registered_address: str
+     planning_area: str
+     organization_name: Optional[str] = None
+     organization_uen: Optional[str] = None
+     organization_type: Optional[str] = None
+
+
+ class RecommendationResult(BaseModel):
+     """Single recommendation result."""
+     id: str
+     form_type: str
+     score: float
+     distance: float
+     form_data: Dict[str, Any]
+
+
+ class RecommendResponse(BaseModel):
+     """Response containing recommendations."""
+     query_id: Optional[str]
+     results: List[RecommendationResult]
+     total_found: int
+
+
+ class StatsResponse(BaseModel):
+     """Form statistics response."""
+     donor: int
+     volunteer: int
+     total: int
+
+
+ # ============================================================================
+ # Database & Encoder Setup
+ # ============================================================================
+
+ async def init_services():
+     """Initialize encoder and database connection."""
+     global encoder, vector_store, pool, gis_recommender
+
+     try:
+         from encoders.sealion import SeaLionEncoder
+         from recommender.vector_store import DonorVectorStore
+         from recommender.gis_recommender import GISRecommender
+         from psycopg_pool import AsyncConnectionPool
+
+         # Initialize encoder (reads SEALION_ENDPOINT from env)
+         encoder = SeaLionEncoder()
+
+         # Build connection string from env vars
+         db_host = os.getenv("SUPABASE_DB_HOST")
+         db_port = os.getenv("SUPABASE_DB_PORT", "6543")
+         db_name = os.getenv("SUPABASE_DB_NAME", "postgres")
+         db_user = os.getenv("SUPABASE_DB_USER")
+         db_password = os.getenv("SUPABASE_DB_PASSWORD")
+         db_sslmode = os.getenv("SUPABASE_DB_SSLMODE", "require")
+
+         if db_host and db_user and db_password:
+             conn_string = (
+                 f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
+                 f"?sslmode={db_sslmode}"
+             )
+             pool = AsyncConnectionPool(
+                 conninfo=conn_string,
+                 max_size=10,
+                 kwargs={"autocommit": True, "prepare_threshold": None},
+             )
+             await pool.open()
+             vector_store = DonorVectorStore(pool)
+             gis_recommender = GISRecommender(vector_store=vector_store, encoder=encoder)
+             print("[OK] Database connection pool initialized")
+             print("[OK] GIS Recommender initialized")
+         else:
+             print("[WARN] Database credentials not configured, vector store disabled")
+
+         print("[OK] SeaLion encoder initialized")
+
+     except Exception as e:
+         print(f"[WARN] Service initialization error: {e}")
+         print(" Some endpoints may not be available")
+
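For reference, the DSN assembled in `init_services` follows the standard libpq connection-URI format. A minimal standalone sketch of the same assembly; the host, user, and password values below are placeholders for illustration, not real project credentials:

```python
# Sketch of the DSN assembly used in init_services(), with placeholder values.
# None of these credentials are real; they only illustrate the URI format.
db_host = "db.example.supabase.co"  # placeholder host
db_port = "6543"                    # Supabase pooler port used as the default above
db_name = "postgres"
db_user = "postgres"                # placeholder user
db_password = "secret"              # placeholder password
db_sslmode = "require"

conn_string = (
    f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
    f"?sslmode={db_sslmode}"
)
print(conn_string)
```

With real values supplied via the `SUPABASE_DB_*` environment variables, this is the string handed to `AsyncConnectionPool`.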
+
+ async def close_services():
+     """Close database connections."""
+     global pool
+     if pool:
+         await pool.close()
+         print("[OK] Database connection pool closed")
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     """Lifespan context manager for startup/shutdown."""
+     await init_services()
+     await init_langgraph()
+     yield
+     await close_services()
+
+
+ # ============================================================================
+ # FastAPI App
+ # ============================================================================
+
+ app = FastAPI(
+     title="Donor Recommendation API",
+     description="API for chat, donor/volunteer registration, and recommendations",
+     version="1.0.0",
+     lifespan=lifespan,
+ )
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ # ============================================================================
+ # LangGraph Chat Setup
+ # ============================================================================
+
+ # Global graph instance (initialized at startup)
+ langgraph_chat = None
+
+
+ async def init_langgraph():
+     """Initialize LangGraph with memory."""
+     global langgraph_chat
+     try:
+         from graph.builder import build_graph_with_memory
+         graph, _, _ = await build_graph_with_memory()
+         langgraph_chat = graph
+         print("[OK] LangGraph chat with memory initialized")
+     except Exception as e:
+         import traceback
+         print(f"[WARN] LangGraph initialization error: {e}")
+         traceback.print_exc()
+         print(" /chat endpoint may not be available")
+
+
+ # ============================================================================
+ # Helper Functions
+ # ============================================================================
+
+ def donor_form_to_text(form: DonorFormRequest) -> str:
+     """Convert donor form to encoding text."""
+     parts = [
+         f"Donor type: {form.donor_type}",
+         f"Country: {form.country}",
+         f"Preferred language: {form.preferred_language}",
+     ]
+     if form.causes:
+         parts.append(f"Causes interested in: {', '.join(form.causes)}")
+     if form.donation_frequency:
+         parts.append(f"Donation frequency: {form.donation_frequency}")
+     if form.amount_range:
+         parts.append(f"Amount range: {form.amount_range}")
+     if form.bio:
+         parts.append(f"Bio: {form.bio}")
+     if form.motivation:
+         parts.append(f"Motivation: {form.motivation}")
+     return "\n".join(parts)
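The helper above simply flattens a form into labelled lines before the text is sent to the encoder. A standalone sketch of the same flattening over a plain dict, so the exact text the encoder receives is visible; the field values here are made up for illustration:

```python
# Minimal re-implementation of the donor_form_to_text flattening over a dict,
# showing the labelled-lines text that gets embedded. Values are illustrative.
def form_to_text(form: dict) -> str:
    parts = [
        f"Donor type: {form['donor_type']}",
        f"Country: {form['country']}",
        f"Preferred language: {form['preferred_language']}",
    ]
    if form.get("causes"):
        parts.append(f"Causes interested in: {', '.join(form['causes'])}")
    if form.get("motivation"):
        parts.append(f"Motivation: {form['motivation']}")
    return "\n".join(parts)

text = form_to_text({
    "donor_type": "individual",
    "country": "SG",
    "preferred_language": "en",
    "causes": ["education", "environment"],
})
print(text)
```

Optional fields that are absent simply contribute no line, so two donors with different levels of detail still embed into the same labelled format.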
+
+
+ def volunteer_form_to_text(form: VolunteerFormRequest) -> str:
+     """Convert volunteer form to encoding text."""
+     parts = [
+         f"Volunteer type: {form.volunteer_type}",
+         f"Country: {form.country}",
+         f"Preferred language: {form.preferred_language}",
+     ]
+     if form.languages_spoken:
+         parts.append(f"Languages spoken: {', '.join(form.languages_spoken)}")
+     if form.skills:
+         parts.append(f"Skills: {', '.join(form.skills)}")
+     parts.append(f"Availability: {form.availability}")
+     if form.causes:
+         parts.append(f"Causes interested in: {', '.join(form.causes)}")
+     if form.experience:
+         parts.append(f"Experience: {form.experience}")
+     if form.goals:
+         parts.append(f"Goals: {form.goals}")
+     return "\n".join(parts)
+
+
+ def recommend_request_to_text(req: RecommendRequest) -> str:
+     """Convert recommendation request to encoding text."""
+     parts = []
+     if req.country:
+         parts.append(f"Country: {req.country}")
+     if req.preferred_language:
+         parts.append(f"Preferred language: {req.preferred_language}")
+     if req.causes:
+         parts.append(f"Causes interested in: {', '.join(req.causes)}")
+     if req.bio:
+         parts.append(f"Bio: {req.bio}")
+     if req.motivation:
+         parts.append(f"Motivation: {req.motivation}")
+     return "\n".join(parts) if parts else "General query"
+
+
+ # ============================================================================
+ # Health Endpoints
+ # ============================================================================
+
+ @app.get("/")
+ def root():
+     """Root endpoint with service status."""
+     return {
+         "status": "healthy",
+         "message": "Donor Recommendation API is running",
+         "services": {
+             "langgraph_chat": langgraph_chat is not None,
+             "encoder": encoder is not None,
+             "database": vector_store is not None,
+         }
+     }
+
+
+ @app.get("/health")
+ def health():
+     """Health check endpoint."""
+     return {"status": "healthy"}
+
+
+ # ============================================================================
+ # Chat Endpoints
+ # ============================================================================
+
+ class ChatRequestWithMemory(BaseModel):
+     message: str
+     user_id: str = "default_user"
+     thread_id: str = "default_thread"
+     stream: bool = False
+
+
+ @app.post("/chat")
+ async def chat(request: ChatRequestWithMemory):
+     """Chat with LangGraph-powered chatbot with memory."""
+     if not langgraph_chat:
+         raise HTTPException(
+             status_code=503,
+             detail="LangGraph chat not initialized. Check server logs."
+         )
+
+     config = {
+         "configurable": {
+             "thread_id": request.thread_id,
+             "user_id": request.user_id,
+         }
+     }
+
+     try:
+         if request.stream:
+             async def generate_stream():
+                 async for chunk in langgraph_chat.astream(
+                     {"messages": [{"role": "user", "content": request.message}]},
+                     config,
+                     stream_mode="values",
+                 ):
+                     if chunk.get("messages"):
+                         last_msg = chunk["messages"][-1]
+                         if hasattr(last_msg, 'content') and last_msg.type == 'ai':
+                             yield last_msg.content
+
+             return StreamingResponse(
+                 generate_stream(),
+                 media_type="text/event-stream"
+             )
+         else:
+             # Non-streaming: collect full response
+             response_content = ""
+             async for chunk in langgraph_chat.astream(
+                 {"messages": [{"role": "user", "content": request.message}]},
+                 config,
+                 stream_mode="values",
+             ):
+                 if chunk.get("messages"):
+                     last_msg = chunk["messages"][-1]
+                     if hasattr(last_msg, 'content') and last_msg.type == 'ai':
+                         response_content = last_msg.content
+
+             return ChatResponse(response=response_content)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
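The JSON body this endpoint expects mirrors `ChatRequestWithMemory`: only `message` is required, and `thread_id` keys the conversation memory. A sketch of a non-streaming request payload, with example identifier values:

```python
import json

# Example /chat request body; only "message" is required, the other
# fields fall back to the model defaults. The ids are illustrative.
payload = {
    "message": "What causes can I donate to?",
    "user_id": "user_123",      # example user id
    "thread_id": "thread_abc",  # conversation memory key
    "stream": False,            # False -> single ChatResponse JSON
}
body = json.dumps(payload)
print(body)
```

Reusing the same `thread_id` on a later call continues the same conversation; a new `thread_id` starts a fresh one.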
+
+
+ # ============================================================================
+ # Agentic RAG Endpoints
+ # ============================================================================
+
+ # Global agentic RAG agent instance
+ agentic_rag_agent = None
+
+
+ class AgenticRAGRequest(BaseModel):
+     """Request for Agentic RAG search."""
+     query: str = Field(..., description="Natural language query for donor/volunteer search")
+     max_iterations: int = Field(default=10, ge=1, le=20, description="Max tool call iterations")
+
+
+ class AgenticRAGResponse(BaseModel):
+     """Response from Agentic RAG search."""
+     response: str
+     tool_calls: List[Dict[str, Any]]
+     message_count: int
+
+
+ async def init_agentic_rag():
+     """Initialize the Agentic RAG agent."""
+     global agentic_rag_agent
+
+     if encoder is None or vector_store is None:
+         print("[WARN] Cannot initialize Agentic RAG: encoder or vector_store not available")
+         return
+
+     try:
+         from agents.agentic_rag import AgenticRAGAgent
+         from langchain_ollama import ChatOllama
+
+         # Create LLM for the agent
+         api_key = os.getenv('OLLAMA_API_KEY')
+         if api_key:
+             llm = ChatOllama(
+                 model="gpt-oss:120b",
+                 base_url="https://ollama.com",
+                 client_kwargs={
+                     "headers": {"Authorization": f"Bearer {api_key}"}
+                 }
+             )
+         else:
+             llm = ChatOllama(model="gpt-oss:120b-cloud")
+
+         agentic_rag_agent = AgenticRAGAgent(llm, encoder, vector_store)
+         print("[OK] Agentic RAG agent initialized")
+
+     except Exception as e:
+         import traceback
+         print(f"[WARN] Agentic RAG initialization error: {e}")
+         traceback.print_exc()
+
+
+ @app.post("/rag/search", response_model=AgenticRAGResponse)
+ async def agentic_rag_search(request: AgenticRAGRequest):
+     """
+     Agentic RAG search - the agent autonomously explores the vector store.
+
+     The agent will:
+     1. Analyze your query to understand what you're looking for
+     2. Explore available categories in the database
+     3. Perform semantic and/or filtered searches
+     4. Iteratively refine results if needed
+     5. Return detailed findings with reasoning
+
+     Example queries:
+     - "Find donors interested in education in Singapore"
+     - "Show me corporate donors who focus on environmental causes"
+     - "Find volunteers with tech skills available on weekends"
+     """
+     global agentic_rag_agent
+
+     # Lazy initialization if not done yet
+     if agentic_rag_agent is None:
+         await init_agentic_rag()
+
+     if agentic_rag_agent is None:
+         raise HTTPException(
+             status_code=503,
+             detail="Agentic RAG not available. Ensure encoder and database are configured."
+         )
+
+     try:
+         result = await agentic_rag_agent.search(request.query)
+
+         return AgenticRAGResponse(
+             response=result["response"],
+             tool_calls=result["tool_calls"],
+             message_count=result["message_count"]
+         )
+     except Exception as e:
+         import traceback
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
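The iterative behaviour described in the docstring, where the agent calls a tool, observes the result, and refines, can be sketched as a plain tool-calling loop. The tool names, the stub observations, and the stopping rule below are toy stand-ins, not the real `AgenticRAGAgent` internals:

```python
# Toy sketch of an agentic loop: pick a tool, observe the result, refine,
# then stop. In the real agent the LLM chooses the tool and arguments;
# here the choices are hard-coded purely to illustrate the control flow.
def run_agent(query: str, max_iterations: int = 10) -> dict:
    tool_calls = []
    results = []
    for step in range(max_iterations):
        if not results:
            # First pass: broad semantic search (stand-in for the real tool)
            tool_calls.append({"tool": "semantic_search", "args": {"q": query}})
            results = ["donor_1", "donor_2"]  # stubbed observation
        else:
            # Refine once using a filter tool, then stop
            tool_calls.append({"tool": "filter_by_country", "args": {"country": "SG"}})
            results = results[:1]
            break
    return {"response": f"Found {len(results)} match(es)", "tool_calls": tool_calls}

out = run_agent("donors interested in education in Singapore")
print(out["response"])
```

The `max_iterations` bound plays the same role as the field on `AgenticRAGRequest`: it caps how many tool calls the loop may make before the agent must answer.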
+
+
+ @app.get("/rag/tools")
+ async def list_rag_tools():
+     """List available RAG tools and their descriptions."""
+     from tools.rag_tools import RAG_TOOLS
+
+     tools_info = []
+     for tool in RAG_TOOLS:
+         tools_info.append({
+             "name": tool.name,
+             "description": tool.description,
+         })
+
+     return {
+         "tools": tools_info,
+         "total": len(tools_info)
+     }
+
+
+ @app.get("/rag/categories")
+ async def get_rag_categories():
+     """Get available categories in the vector store for filtering."""
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     from tools.rag_tools import list_available_categories, set_rag_dependencies
+
+     # Ensure dependencies are set
+     if encoder and vector_store:
+         set_rag_dependencies(encoder, vector_store)
+
+     try:
+         result = await list_available_categories.ainvoke({})
+         import json
+         return json.loads(result)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ # ============================================================================
+ # Donor Endpoints
+ # ============================================================================
+
+ @app.post("/donors/register", response_model=FormResponse)
+ async def register_donor(form: DonorFormRequest):
+     """Register a donor and generate embedding."""
+     if not encoder:
+         raise HTTPException(status_code=503, detail="Encoder not initialized")
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     try:
+         # Convert form to encoding text
+         text = donor_form_to_text(form)
+
+         # Generate embedding
+         embedding = await encoder.encode(text)
+
+         # Store in database
+         form_data = form.model_dump()
+         await vector_store.store_embedding(
+             form_id=form.id,
+             form_type="donor",
+             embedding=embedding,
+             form_data=form_data
+         )
+
+         return FormResponse(
+             id=form.id,
+             form_type="donor",
+             message="Donor registered successfully",
+             embedding_dimension=len(embedding)
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
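The `score`/`distance` pair that the recommendation endpoints return comes from `DonorVectorStore.find_similar`, whose implementation is not shown in this file. A common convention with cosine distance (e.g. pgvector's cosine operator) is `score = 1 - distance`; a sketch under that assumption, with toy two-dimensional vectors:

```python
import math

# Cosine distance between two vectors, and the score = 1 - distance
# convention. This is an assumption about the vector store's scoring;
# the actual DonorVectorStore internals are defined elsewhere.
def cosine_distance(a: list, b: list) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    return 1.0 - dot / (na * nb)

query = [1.0, 0.0]
candidates = {"donor_a": [1.0, 0.0], "donor_b": [0.0, 1.0]}
scores = {k: 1.0 - cosine_distance(query, v) for k, v in candidates.items()}
print(scores)
```

An identical embedding scores 1.0 and an orthogonal one scores 0.0, which is why higher `score` means a closer profile match in the responses above.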
+
+
+ @app.post("/donors/recommend", response_model=RecommendResponse)
+ async def recommend_donors(request: RecommendRequest):
+     """Find similar donors based on query."""
+     if not encoder:
+         raise HTTPException(status_code=503, detail="Encoder not initialized")
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     try:
+         # Get query embedding
+         if request.form_id:
+             # Verify the referenced form exists before building the query
+             existing = await vector_store.get_embedding(request.form_id)
+             if not existing:
+                 raise HTTPException(status_code=404, detail=f"Form {request.form_id} not found")
+             # Re-encode the request text for the query (the stored embedding is not reused here)
+             text = recommend_request_to_text(request)
+             query_embedding = await encoder.encode(text)
+         else:
+             # Generate new embedding from request data
+             text = recommend_request_to_text(request)
+             query_embedding = await encoder.encode(text)
+
+         # Find similar donors
+         results = await vector_store.find_similar(
+             query_embedding=query_embedding,
+             form_type="donor",
+             limit=request.limit,
+             country_filter=request.country_filter,
+             exclude_ids=request.exclude_ids if request.exclude_ids else None
+         )
+
+         return RecommendResponse(
+             query_id=request.form_id,
+             results=[
+                 RecommendationResult(
+                     id=r.id,
+                     form_type=r.form_type,
+                     score=r.score,
+                     distance=r.distance,
+                     form_data=r.form_data
+                 )
+                 for r in results
+             ],
+             total_found=len(results)
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ # ============================================================================
+ # Volunteer Endpoints
+ # ============================================================================
+
+ @app.post("/volunteers/register", response_model=FormResponse)
+ async def register_volunteer(form: VolunteerFormRequest):
+     """Register a volunteer and generate embedding."""
+     if not encoder:
+         raise HTTPException(status_code=503, detail="Encoder not initialized")
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     try:
+         # Convert form to encoding text
+         text = volunteer_form_to_text(form)
+
+         # Generate embedding
+         embedding = await encoder.encode(text)
+
+         # Store in database
+         form_data = form.model_dump()
+         await vector_store.store_embedding(
+             form_id=form.id,
+             form_type="volunteer",
+             embedding=embedding,
+             form_data=form_data
+         )
+
+         return FormResponse(
+             id=form.id,
+             form_type="volunteer",
+             message="Volunteer registered successfully",
+             embedding_dimension=len(embedding)
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/volunteers/recommend", response_model=RecommendResponse)
+ async def recommend_volunteers(request: RecommendRequest):
+     """Find similar volunteers based on query."""
+     if not encoder:
+         raise HTTPException(status_code=503, detail="Encoder not initialized")
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     try:
+         # Generate query embedding
+         text = recommend_request_to_text(request)
+         query_embedding = await encoder.encode(text)
+
+         # Find similar volunteers
+         results = await vector_store.find_similar(
+             query_embedding=query_embedding,
+             form_type="volunteer",
+             limit=request.limit,
+             country_filter=request.country_filter,
+             exclude_ids=request.exclude_ids if request.exclude_ids else None
+         )
+
+         return RecommendResponse(
+             query_id=request.form_id,
+             results=[
+                 RecommendationResult(
+                     id=r.id,
+                     form_type=r.form_type,
+                     score=r.score,
+                     distance=r.distance,
+                     form_data=r.form_data
+                 )
+                 for r in results
+             ],
+             total_found=len(results)
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ # ============================================================================
+ # Form Management Endpoints
+ # ============================================================================
+
+ @app.get("/forms/{form_id}")
+ async def get_form(form_id: str):
+     """Get a stored form by ID."""
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     result = await vector_store.get_embedding(form_id)
+     if not result:
+         raise HTTPException(status_code=404, detail=f"Form {form_id} not found")
+
+     return {
+         "id": result.id,
+         "form_type": result.form_type,
+         "form_data": result.form_data
+     }
+
+
+ @app.delete("/forms/{form_id}")
+ async def delete_form(form_id: str):
+     """Delete a form by ID."""
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     deleted = await vector_store.delete_embedding(form_id)
+     if not deleted:
+         raise HTTPException(status_code=404, detail=f"Form {form_id} not found")
+
+     return {"message": f"Form {form_id} deleted successfully"}
+
+
+ @app.get("/forms/stats/summary", response_model=StatsResponse)
+ async def get_form_stats():
+     """Get form counts by type."""
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     try:
+         counts = await vector_store.count_by_type()
+         return StatsResponse(
+             donor=counts.get("donor", 0),
+             volunteer=counts.get("volunteer", 0),
+             total=counts.get("total", 0)
+         )
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ # ============================================================================
+ # Cause-based Search Endpoint
+ # ============================================================================
+
+ @app.post("/forms/search/causes")
+ async def search_by_causes(
+     causes: List[str],
+     limit: int = 20
+ ):
+     """Search forms by causes with embedding ranking."""
+     if not encoder:
+         raise HTTPException(status_code=503, detail="Encoder not initialized")
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     try:
+         # Create a synthetic query embedding for ranking
+         query_text = f"Causes interested in: {', '.join(causes)}"
+         query_embedding = await encoder.encode(query_text)
+
+         results = await vector_store.find_by_causes(
+             target_causes=causes,
+             query_embedding=query_embedding,
+             limit=limit
+         )
+
+         return {
+             "causes": causes,
+             "results": [
+                 {
+                     "id": r.id,
+                     "form_type": r.form_type,
+                     "score": r.score,
+                     "distance": r.distance,
+                     "form_data": r.form_data
+                 }
+                 for r in results
+             ],
+             "total_found": len(results)
+         }
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
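`find_by_causes` is defined elsewhere, but conceptually it combines a categorical cause filter with an embedding-based re-rank, as the synthetic query text above suggests. A toy sketch of that two-stage pattern; the records and precomputed scores below are made up for illustration:

```python
# Two-stage cause search: (1) keep only forms sharing at least one target
# cause, (2) order survivors by a similarity score. The "score" field here
# stands in for the embedding rank the real endpoint computes.
records = [
    {"id": "d1", "causes": ["education"], "score": 0.91},
    {"id": "d2", "causes": ["health"], "score": 0.88},
    {"id": "d3", "causes": ["education", "environment"], "score": 0.75},
]
target = {"education"}

matches = [r for r in records if target & set(r["causes"])]
matches.sort(key=lambda r: r["score"], reverse=True)
print([r["id"] for r in matches])
```

The hard filter guarantees topical relevance, while the embedding score still decides the order among qualifying forms.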
+
+
+ # ============================================================================
+ # GIS & Client Targeting Endpoints
+ # ============================================================================
+
+ # Mock Singpass data for different organization profiles
+ MOCK_SINGPASS_PROFILES = {
+     "org_001": SingpassMockData(
+         name="Sarah Tan Wei Ling",
+         nric_masked="S****567A",
+         email="sarah.tan@example.org",
+         mobile="+65 9123 4567",
+         registered_address="123 Orchard Road, #12-01, Singapore 238867",
+         planning_area="orchard",
+         organization_name="Hearts of Hope Foundation",
+         organization_uen="201912345K",
+         organization_type="charity",
+     ),
+     "org_002": SingpassMockData(
+         name="Ahmad bin Ibrahim",
+         nric_masked="S****234B",
+         email="ahmad.ibrahim@greensg.org",
+         mobile="+65 9876 5432",
+         registered_address="45 Jurong East Ave 1, #05-12, Singapore 609788",
+         planning_area="jurong_east",
+         organization_name="Green Singapore Initiative",
+         organization_uen="201823456M",
+         organization_type="ngo",
+     ),
+     "org_003": SingpassMockData(
+         name="Lee Mei Hua",
+         nric_masked="S****789C",
+         email="meihua@eldercare.sg",
+         mobile="+65 8765 4321",
+         registered_address="78 Toa Payoh Lorong 1, #08-22, Singapore 310078",
+         planning_area="toa_payoh",
+         organization_name="ElderCare Singapore",
+         organization_uen="200934567N",
+         organization_type="social_enterprise",
+     ),
+ }
+
+
+ @app.get("/singpass/mock/{profile_id}", response_model=SingpassMockData)
+ async def get_singpass_mock_data(profile_id: str):
+     """
+     Get mock Singpass data for autofill demonstration.
+
+     Available profiles: org_001, org_002, org_003
+     """
+     if profile_id not in MOCK_SINGPASS_PROFILES:
+         # Fall back to the default profile if the ID is unknown
+         profile_id = "org_001"
+
+     return MOCK_SINGPASS_PROFILES[profile_id]
+
+
+ @app.get("/singpass/mock", response_model=Dict[str, SingpassMockData])
+ async def list_singpass_mock_profiles():
+     """List all available mock Singpass profiles."""
+     return MOCK_SINGPASS_PROFILES
+
+
+ @app.get("/planning-areas")
+ async def get_planning_areas():
+     """Get all Singapore planning areas with coordinates."""
+     from recommender.gis_recommender import PLANNING_AREAS
+
+     return PLANNING_AREAS
+
+
+ @app.get("/housing-types")
+ async def get_housing_types():
+     """Get all housing types with income proxy scores."""
+     from recommender.gis_recommender import HOUSING_INCOME_PROXY, HousingType
+
+     return {
+         "types": [h.value for h in HousingType],
+         "income_proxy": {h.value: score for h, score in HOUSING_INCOME_PROXY.items()},
+     }
+
+
+ @app.post("/clients/register", response_model=FormResponse)
+ async def register_client(profile: ClientProfileRequest):
+     """
+     Register a client profile with spatial and behavioral data.
+
+     This creates an embedding combining interests/causes with spatial context.
+     """
+     if not encoder:
+         raise HTTPException(status_code=503, detail="Encoder not initialized")
+     if not vector_store:
+         raise HTTPException(status_code=503, detail="Database not connected")
+
+     try:
+         from recommender.gis_recommender import ClientProfile, HousingType
+
+         # Create client profile
+         client = ClientProfile(
+             user_id=profile.user_id,
+             coordinates=tuple(profile.coordinates),
+             planning_area=profile.planning_area,
+             housing_type=HousingType(profile.housing_type),
+             interests=profile.interests,
+             causes=profile.causes,
+             preferred_language=profile.preferred_language,
+             is_donor=profile.is_donor,
+             total_donated=profile.total_donated,
+             donation_count=profile.donation_count,
+             age_range=profile.age_range,
+         )
+
+         # Generate embedding
+         text = client.to_embedding_text()
+         embedding = await encoder.encode(text)
+
+         # Store in database
+         form_data = client.to_dict()
+         form_data["country"] = "SG"  # For existing filter compatibility
+
+         await vector_store.store_embedding(
+             form_id=profile.user_id,
+             form_type="client",
+             embedding=embedding,
+             form_data=form_data,
+         )
+
+         return FormResponse(
+             id=profile.user_id,
+             form_type="client",
+             message="Client profile registered successfully",
+             embedding_dimension=len(embedding),
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/clients/lookalike", response_model=LookalikeResponse)
+ async def find_lookalike_clients(request: LookalikeRequest):
+     """
+     Find lookalike clients (potential donors) based on a seed profile.
+
+     This uses the GIS recommender with hybrid semantic-spatial matching:
+     1. Find registered donors from database via vector search
+     2. Apply spatial/housing filters
+     3. Score using tiered targeting (vector + spatial proxy + proximity)
+     4. Fall back to mock data if database has insufficient results
+     5. Return results with optional GeoJSON for mapping
+
+     Note: Searches BOTH donors (from /donors/register) and clients
+     (from /clients/register) to find potential matches.
+     """
+     try:
+         from recommender.gis_recommender import (
+             ClientProfile,
+             HousingType,
+             GISRecommender,
+             generate_seed_donor_profile,
+             generate_mock_clients,
+         )
+
+         # Create seed profile from request
+         seed = generate_seed_donor_profile(
+             cause=request.seed_causes[0] if request.seed_causes else "education"
+         )
+         seed.causes = request.seed_causes
+         seed.interests = request.seed_interests
+
+         # Update seed coordinates if planning area specified
+         if request.planning_area_filter:
+             from recommender.gis_recommender import PLANNING_AREAS
+
+             if request.planning_area_filter in PLANNING_AREAS:
+                 area = PLANNING_AREAS[request.planning_area_filter]
+                 seed.coordinates = (area["lat"], area["lng"])
+                 seed.planning_area = request.planning_area_filter
+
+         # Regenerate embeddings for updated seed
+         seed.embedding = None  # Force regeneration
+         local_recommender = GISRecommender()
+         seed.embedding = local_recommender._generate_fallback_embedding(seed)
+         seed.compute_reduced_embeddings()
+
+         # Convert housing type filter
+         housing_filter = None
+         if request.housing_type_filter:
+             housing_filter = [HousingType(h) for h in request.housing_type_filter]
+
+         scored_clients = []
+         db_results_count = 0
+
+         # Try database search first if available
+         if gis_recommender and encoder and vector_store:
+             try:
+                 print(
+                     f"Searching database for donors matching causes: {request.seed_causes}"
+                 )
+                 scored_clients = await gis_recommender.find_lookalikes(
+                     seed_profile=seed,
+                     k=request.limit * 2,  # Get more to allow for filtering
+                     planning_area_filter=None,  # Remove strict filter for DB search
+                     housing_type_filter=None,  # Filter after retrieval
+                     use_hybrid=False,
+                 )
+                 db_results_count = len(scored_clients)
+                 print(f"Found {db_results_count} donors/clients from database")
+
+                 # Apply filters after retrieval for more flexible matching
+                 if request.planning_area_filter:
1080
+ scored_clients = [
1081
+ sc
1082
+ for sc in scored_clients
1083
+ if sc.client.planning_area == request.planning_area_filter
1084
+ ]
1085
+
1086
+ if housing_filter:
1087
+ scored_clients = [
1088
+ sc
1089
+ for sc in scored_clients
1090
+ if sc.client.housing_type in housing_filter
1091
+ ]
1092
+
1093
+ except Exception as e:
1094
+ print(f"Database search failed: {e}")
1095
+ import traceback
1096
+
1097
+ traceback.print_exc()
1098
+
1099
+ # If insufficient results from database, supplement with mock data
1100
+ min_results = max(request.limit // 2, 10)  # at least half the requested count, and no fewer than 10
1101
+ if len(scored_clients) < min_results:
1102
+ print(f"Only {len(scored_clients)} from DB, supplementing with mock data")
1103
+
1104
+ # Generate mock candidates
1105
+ fallback_candidates = generate_mock_clients(150)
1106
+
1107
+ # Filter by causes for relevance
1108
+ if request.seed_causes:
1109
+ cause_matched = [
1110
+ c
1111
+ for c in fallback_candidates
1112
+ if any(cause in c.causes for cause in request.seed_causes)
1113
+ ]
1114
+ if len(cause_matched) >= 20:
1115
+ fallback_candidates = cause_matched
1116
+
1117
+ # Use hybrid matching on mock data
1118
+ mock_results = local_recommender.find_lookalikes_hybrid(
1119
+ seed_profile=seed,
1120
+ candidates=fallback_candidates,
1121
+ k=request.limit - len(scored_clients),
1122
+ planning_area_filter=request.planning_area_filter,
1123
+ housing_type_filter=housing_filter,
1124
+ )
1125
+
1126
+ scored_clients.extend(mock_results)
1127
+ print(
1128
+ f"Added {len(mock_results)} mock results, total: {len(scored_clients)}"
1129
+ )
1130
+
1131
+ # Sort combined results by score
1132
+ scored_clients.sort(key=lambda x: x.final_score, reverse=True)
1133
+ scored_clients = scored_clients[: request.limit]
1134
+
1135
+ # Apply tiered targeting with relaxed min_score for small datasets
1136
+ effective_min_score = max(0, request.min_score - 0.1) # Relax slightly
1137
+ tiered = local_recommender.apply_tiered_targeting(
1138
+ scored_clients, min_score=effective_min_score
1139
+ )
1140
+
1141
+ # Convert to response format
1142
+ def to_response(sc):
1143
+ return ScoredClientResponse(
1144
+ user_id=sc.client.user_id,
1145
+ planning_area=sc.client.planning_area,
1146
+ housing_type=sc.client.housing_type.value,
1147
+ causes=sc.client.causes,
1148
+ interests=sc.client.interests,
1149
+ is_donor=sc.client.is_donor,
1150
+ final_score=round(sc.final_score, 3),
1151
+ vector_similarity=round(sc.vector_similarity_score, 3),
1152
+ spatial_proxy=round(sc.spatial_proxy_score, 3),
1153
+ proximity=round(sc.proximity_score, 3),
1154
+ coordinates=(
1155
+ list(sc.client.coordinates) if request.include_geojson else None
1156
+ ),
1157
+ )
1158
+
1159
+ tiers_response = {
1160
+ "tier_1": [to_response(sc) for sc in tiered["tier_1"]],
1161
+ "tier_2": [to_response(sc) for sc in tiered["tier_2"]],
1162
+ "tier_3": [to_response(sc) for sc in tiered["tier_3"]],
1163
+ }
1164
+
1165
+ # Generate GeoJSON if requested
1166
+ geojson = None
1167
+ if request.include_geojson:
1168
+ all_clients = tiered["tier_1"] + tiered["tier_2"] + tiered["tier_3"]
1169
+ geojson = local_recommender.to_geojson(all_clients)
1170
+
1171
+ total = sum(len(t) for t in tiered.values())
1172
+
1173
+ return LookalikeResponse(
1174
+ seed_causes=request.seed_causes,
1175
+ total_found=total,
1176
+ tiers=tiers_response,
1177
+ geojson=geojson,
1178
+ )
1179
+
1180
+ except Exception as e:
1181
+ import traceback
1182
+
1183
+ traceback.print_exc()
1184
+ raise HTTPException(status_code=500, detail=str(e))
1185
+
1186
+
1187
+ async def _get_mock_lookalike_response(request: LookalikeRequest) -> LookalikeResponse:
1188
+ """Generate mock lookalike response when GIS recommender unavailable."""
1189
+ from recommender.gis_recommender import (
1190
+ generate_mock_clients,
1191
+ PLANNING_AREAS,
1192
+ HOUSING_INCOME_PROXY,
1193
+ HousingType,
1194
+ )
1195
+
1196
+ # Generate mock clients
1197
+ mock_clients = generate_mock_clients(100)
1198
+
1199
+ # Filter by causes
1200
+ filtered = [
1201
+ c
1202
+ for c in mock_clients
1203
+ if any(cause in c.causes for cause in request.seed_causes)
1204
+ ]
1205
+
1206
+ # Apply planning area filter
1207
+ if request.planning_area_filter:
1208
+ filtered = [
1209
+ c for c in filtered if c.planning_area == request.planning_area_filter
1210
+ ]
1211
+
1212
+ # Score and sort
1213
+ scored = []
1214
+ for client in filtered[: request.limit]:
1215
+ # Calculate mock scores
1216
+ cause_match = len(set(client.causes) & set(request.seed_causes)) / max(
1217
+ len(request.seed_causes), 1
1218
+ )
1219
+ spatial_score = HOUSING_INCOME_PROXY.get(client.housing_type, 0.5)
1220
+ final_score = 0.5 * cause_match + 0.3 * spatial_score + 0.2 * 0.5  # last term: neutral proximity (0.5)
1221
+
1222
+ scored.append(
1223
+ {
1224
+ "client": client,
1225
+ "final_score": final_score,
1226
+ "vector_similarity": cause_match,
1227
+ "spatial_proxy": spatial_score,
1228
+ "proximity": 0.5,
1229
+ }
1230
+ )
1231
+
1232
+ scored.sort(key=lambda x: x["final_score"], reverse=True)
1233
+
1234
+ # Apply min score filter
1235
+ scored = [s for s in scored if s["final_score"] >= request.min_score]
1236
+
1237
+ # Create tiers
1238
+ n = len(scored)
1239
+ tier_size = max(n // 3, 1)
1240
+
1241
+ def to_response(s):
1242
+ c = s["client"]
1243
+ return ScoredClientResponse(
1244
+ user_id=c.user_id,
1245
+ planning_area=c.planning_area,
1246
+ housing_type=c.housing_type.value,
1247
+ causes=c.causes,
1248
+ interests=c.interests,
1249
+ is_donor=c.is_donor,
1250
+ final_score=round(s["final_score"], 3),
1251
+ vector_similarity=round(s["vector_similarity"], 3),
1252
+ spatial_proxy=round(s["spatial_proxy"], 3),
1253
+ proximity=round(s["proximity"], 3),
1254
+ coordinates=list(c.coordinates) if request.include_geojson else None,
1255
+ )
1256
+
1257
+ tiers = {
1258
+ "tier_1": [to_response(s) for s in scored[:tier_size]],
1259
+ "tier_2": [to_response(s) for s in scored[tier_size : tier_size * 2]],
1260
+ "tier_3": [to_response(s) for s in scored[tier_size * 2 :]],
1261
+ }
1262
+
1263
+ # Generate GeoJSON
1264
+ geojson = None
1265
+ if request.include_geojson:
1266
+ features = []
1267
+ for s in scored:
1268
+ c = s["client"]
1269
+ features.append(
1270
+ {
1271
+ "type": "Feature",
1272
+ "geometry": {
1273
+ "type": "Point",
1274
+ "coordinates": [
1275
+ round(c.coordinates[1], 3),
1276
+ round(c.coordinates[0], 3),
1277
+ ],
1278
+ },
1279
+ "properties": {
1280
+ "user_id": c.user_id,
1281
+ "planning_area": c.planning_area,
1282
+ "housing_type": c.housing_type.value,
1283
+ "causes": c.causes,
1284
+ "is_donor": c.is_donor,
1285
+ "final_score": round(s["final_score"], 3),
1286
+ },
1287
+ }
1288
+ )
1289
+ geojson = {"type": "FeatureCollection", "features": features}
1290
+
1291
+ return LookalikeResponse(
1292
+ seed_causes=request.seed_causes,
1293
+ total_found=len(scored),
1294
+ tiers=tiers,
1295
+ geojson=geojson,
1296
+ )
1297
+
1298
+
1299
+ @app.post("/clients/seed-mock-data")
1300
+ async def seed_mock_client_data(count: int = 100):
1301
+ """
1302
+ Seed the database with mock client profiles for testing.
1303
+
1304
+ This populates the vector store with realistic Singapore client data.
1305
+ """
1306
+ if not encoder:
1307
+ raise HTTPException(status_code=503, detail="Encoder not initialized")
1308
+ if not vector_store:
1309
+ raise HTTPException(status_code=503, detail="Database not connected")
1310
+
1311
+ try:
1312
+ from recommender.gis_recommender import generate_mock_clients
1313
+
1314
+ clients = generate_mock_clients(count)
1315
+ registered = 0
1316
+
1317
+ for client in clients:
1318
+ text = client.to_embedding_text()
1319
+ embedding = await encoder.encode(text)
1320
+
1321
+ form_data = client.to_dict()
1322
+ form_data["country"] = "SG"
1323
+
1324
+ await vector_store.store_embedding(
1325
+ form_id=client.user_id,
1326
+ form_type="client",
1327
+ embedding=embedding,
1328
+ form_data=form_data,
1329
+ )
1330
+ registered += 1
1331
+
1332
+ return {
1333
+ "message": f"Seeded {registered} mock client profiles",
1334
+ "count": registered,
1335
+ }
1336
+
1337
+ except Exception as e:
1338
+ raise HTTPException(status_code=500, detail=str(e))
1339
+
1340
+
1341
+ @app.get("/debug/database-stats")
1342
+ async def get_database_stats():
1343
+ """
1344
+ Debug endpoint to check what's stored in the vector database.
1345
+
1346
+ Returns counts of donors, volunteers, and clients in the database.
1347
+ """
1348
+ if not vector_store:
1349
+ return {"error": "Database not connected", "stats": None}
1350
+
1351
+ try:
1352
+ async with vector_store.pool.connection() as conn:
1353
+ async with conn.cursor() as cur:
1354
+ # Count by form_type
1355
+ await cur.execute(
1356
+ """
1357
+ SELECT
1358
+ metadata->>'form_type' as form_type,
1359
+ COUNT(*) as count
1360
+ FROM my_embeddings
1361
+ GROUP BY metadata->>'form_type'
1362
+ ORDER BY count DESC
1363
+ """
1364
+ )
1365
+ type_counts = await cur.fetchall()
1366
+
1367
+ # Get sample entries
1368
+ await cur.execute(
1369
+ """
1370
+ SELECT source_id, metadata->>'form_type',
1371
+ LEFT(text_content::text, 200) as preview
1372
+ FROM my_embeddings
1373
+ ORDER BY id DESC
1374
+ LIMIT 10
1375
+ """
1376
+ )
1377
+ recent = await cur.fetchall()
1378
+
1379
+ return {
1380
+ "connected": True,
1381
+ "form_type_counts": {row[0]: row[1] for row in type_counts},
1382
+ "total_entries": sum(row[1] for row in type_counts),
1383
+ "recent_entries": [
1384
+ {"id": row[0], "form_type": row[1], "preview": row[2]} for row in recent
1385
+ ],
1386
+ }
1387
+ except Exception as e:
1388
+ return {"error": str(e), "stats": None}
1389
+
1390
+
1391
+ @app.get("/clients/map-demographics")
1392
+ async def get_map_demographics(
1393
+ causes: Optional[str] = None, # Comma-separated causes
1394
+ include_donors: bool = True,
1395
+ include_clients: bool = True,
1396
+ ):
1397
+ """
1398
+ Get aggregated demographics data for Singapore map visualization.
1399
+
1400
+ Returns:
1401
+ - Planning area aggregates (donor counts, cause distribution, housing breakdown)
1402
+ - Individual donor/client points with coordinates
1403
+ - Demographics summary for clusters
1404
+ """
1405
+ from recommender.gis_recommender import (
1406
+ PLANNING_AREAS,
1407
+ HousingType,
1408
+ HOUSING_INCOME_PROXY,
1409
+ )
1410
+
1411
+ if not vector_store:
1412
+ # Return mock data if database not available
1413
+ return await _generate_mock_map_demographics(causes)
1414
+
1415
+ try:
1416
+ cause_list = causes.split(",") if causes else None
1417
+
1418
+ # Query all donors and clients from database
1419
+ all_entries = []
1420
+
1421
+ if include_donors:
1422
+ donor_results = await vector_store.find_by_form_type("donor", limit=500)
1423
+ all_entries.extend(donor_results)
1424
+
1425
+ if include_clients:
1426
+ client_results = await vector_store.find_by_form_type("client", limit=500)
1427
+ all_entries.extend(client_results)
1428
+
1429
+ # Aggregate by planning area
1430
+ area_stats = {}
1431
+ individual_points = []
1432
+
1433
+ for entry in all_entries:
1434
+ form_data = (
1435
+ entry.form_data
1436
+ if hasattr(entry, "form_data")
1437
+ else entry.get("form_data", {})
1438
+ )
1439
+ entry_id = entry.id if hasattr(entry, "id") else entry.get("id", "")
1440
+ form_type = (
1441
+ entry.form_type
1442
+ if hasattr(entry, "form_type")
1443
+ else entry.get("form_type", "")
1444
+ )
1445
+
1446
+ # Get planning area
1447
+ planning_area = form_data.get("planning_area", "unknown")
1448
+ if planning_area == "unknown" and form_data.get("country") == "SG":
1449
+ # Infer planning area from ID hash for donors without explicit area
1450
+ import hashlib
1451
+
1452
+ area_list = list(PLANNING_AREAS.keys())
1453
+ idx = int(hashlib.md5(entry_id.encode()).hexdigest(), 16) % len(
1454
+ area_list
1455
+ )
1456
+ planning_area = area_list[idx]
1457
+
1458
+ # Get causes
1459
+ entry_causes = form_data.get("causes", [])
1460
+ if isinstance(entry_causes, str):
1461
+ entry_causes = [entry_causes]
1462
+
1463
+ # Filter by causes if specified
1464
+ if cause_list:
1465
+ if not any(c in entry_causes for c in cause_list):
1466
+ continue
1467
+
1468
+ # Get housing type
1469
+ housing_type = form_data.get("housing_type", "hdb_4_room")
1470
+ amount_range = form_data.get("amount_range", "")
1471
+ if not housing_type or housing_type == "unknown":
1472
+ # Infer from amount_range
1473
+ if "10000" in str(amount_range) or "5000" in str(amount_range):
1474
+ housing_type = "landed"
1475
+ elif "1000" in str(amount_range):
1476
+ housing_type = "condo"
1477
+ elif "500" in str(amount_range):
1478
+ housing_type = "hdb_executive"
1479
+ else:
1480
+ housing_type = "hdb_4_room"
1481
+
1482
+ # Get coordinates
1483
+ if planning_area in PLANNING_AREAS:
1484
+ area_info = PLANNING_AREAS[planning_area]
1485
+ import hashlib  # local import; builtin hash() is salted per process, so it would move points between restarts
+ _h = int(hashlib.md5(entry_id.encode()).hexdigest(), 16)
+ lat = area_info["lat"] + (_h % 100 - 50) * 0.0005  # stable jitter around the area center
+ lng = area_info["lng"] + (_h // 100 % 100 - 50) * 0.0005
1487
+ else:
1488
+ lat, lng = 1.3521, 103.8198 # Singapore center
1489
+
1490
+ # Aggregate by area
1491
+ if planning_area not in area_stats:
1492
+ area_stats[planning_area] = {
1493
+ "name": PLANNING_AREAS.get(planning_area, {}).get(
1494
+ "name", planning_area.replace("_", " ").title()
1495
+ ),
1496
+ "lat": PLANNING_AREAS.get(planning_area, {}).get("lat", 1.3521),
1497
+ "lng": PLANNING_AREAS.get(planning_area, {}).get("lng", 103.8198),
1498
+ "total_count": 0,
1499
+ "donor_count": 0,
1500
+ "client_count": 0,
1501
+ "causes": {},
1502
+ "housing_breakdown": {},
1503
+ "avg_income_proxy": 0,
1504
+ "income_proxies": [],
1505
+ }
1506
+
1507
+ stats = area_stats[planning_area]
1508
+ stats["total_count"] += 1
1509
+ if form_type == "donor":
1510
+ stats["donor_count"] += 1
1511
+ else:
1512
+ stats["client_count"] += 1
1513
+
1514
+ # Count causes
1515
+ for cause in entry_causes:
1516
+ stats["causes"][cause] = stats["causes"].get(cause, 0) + 1
1517
+
1518
+ # Count housing
1519
+ stats["housing_breakdown"][housing_type] = (
1520
+ stats["housing_breakdown"].get(housing_type, 0) + 1
1521
+ )
1522
+
1523
+ # Track income proxy
1524
+ try:
1525
+ income_proxy = HOUSING_INCOME_PROXY.get(HousingType(housing_type), 0.5)
1526
+ except ValueError:  # unknown housing_type string
1527
+ income_proxy = 0.5
1528
+ stats["income_proxies"].append(income_proxy)
1529
+
1530
+ # Add individual point
1531
+ individual_points.append(
1532
+ {
1533
+ "id": entry_id,
1534
+ "type": form_type,
1535
+ "lat": lat,
1536
+ "lng": lng,
1537
+ "planning_area": planning_area,
1538
+ "housing_type": housing_type,
1539
+ "causes": entry_causes[:5], # Limit for performance
1540
+ "is_donor": form_type == "donor",
1541
+ }
1542
+ )
1543
+
1544
+ # Calculate averages
1545
+ for area, stats in area_stats.items():
1546
+ if stats["income_proxies"]:
1547
+ stats["avg_income_proxy"] = round(
1548
+ sum(stats["income_proxies"]) / len(stats["income_proxies"]), 3
1549
+ )
1550
+ del stats["income_proxies"]
1551
+
1552
+ # Create GeoJSON for areas (true polygons would need boundary data; we emit area center points instead)
1553
+ area_geojson = {
1554
+ "type": "FeatureCollection",
1555
+ "features": [
1556
+ {
1557
+ "type": "Feature",
1558
+ "geometry": {
1559
+ "type": "Point",
1560
+ "coordinates": [stats["lng"], stats["lat"]],
1561
+ },
1562
+ "properties": {
1563
+ "planning_area": area,
1564
+ "name": stats["name"],
1565
+ **{k: v for k, v in stats.items() if k not in ["lat", "lng"]},
1566
+ },
1567
+ }
1568
+ for area, stats in area_stats.items()
1569
+ ],
1570
+ }
1571
+
1572
+ # Create GeoJSON for individual points
1573
+ points_geojson = {
1574
+ "type": "FeatureCollection",
1575
+ "features": [
1576
+ {
1577
+ "type": "Feature",
1578
+ "geometry": {
1579
+ "type": "Point",
1580
+ "coordinates": [p["lng"], p["lat"]],
1581
+ },
1582
+ "properties": {
1583
+ "id": p["id"],
1584
+ "type": p["type"],
1585
+ "planning_area": p["planning_area"],
1586
+ "housing_type": p["housing_type"],
1587
+ "causes": p["causes"],
1588
+ "is_donor": p["is_donor"],
1589
+ },
1590
+ }
1591
+ for p in individual_points
1592
+ ],
1593
+ }
1594
+
1595
+ # Summary statistics
1596
+ all_causes = {}
1597
+ all_housing = {}
1598
+ for stats in area_stats.values():
1599
+ for cause, count in stats["causes"].items():
1600
+ all_causes[cause] = all_causes.get(cause, 0) + count
1601
+ for housing, count in stats["housing_breakdown"].items():
1602
+ all_housing[housing] = all_housing.get(housing, 0) + count
1603
+
1604
+ return {
1605
+ "total_donors": sum(s["donor_count"] for s in area_stats.values()),
1606
+ "total_clients": sum(s["client_count"] for s in area_stats.values()),
1607
+ "areas_with_data": len(area_stats),
1608
+ "summary": {
1609
+ "top_causes": sorted(
1610
+ all_causes.items(), key=lambda x: x[1], reverse=True
1611
+ )[:10],
1612
+ "housing_distribution": all_housing,
1613
+ },
1614
+ "area_aggregates": area_geojson,
1615
+ "individual_points": points_geojson,
1616
+ "planning_areas": PLANNING_AREAS,
1617
+ }
1618
+
1619
+ except Exception as e:
1620
+ import traceback
1621
+
1622
+ traceback.print_exc()
1623
+ return await _generate_mock_map_demographics(causes)
1624
+
1625
+
1626
+ async def _generate_mock_map_demographics(causes: Optional[str] = None):
1627
+ """Generate mock demographics data for map visualization."""
1628
+ from recommender.gis_recommender import (
1629
+ PLANNING_AREAS,
1630
+ HOUSING_INCOME_PROXY,
1631
+ HousingType,
1632
+ )
1633
+ import random
1634
+
1635
+ cause_list = (
1636
+ causes.split(",")
1637
+ if causes
1638
+ else ["education", "animals", "poverty", "environment", "health"]
1639
+ )
1640
+
1641
+ area_stats = {}
1642
+ individual_points = []
1643
+
1644
+ for area_id, area_info in PLANNING_AREAS.items():
1645
+ count = random.randint(3, 25)
1646
+ donors = random.randint(1, count)
1647
+
1648
+ area_stats[area_id] = {
1649
+ "name": area_info["name"],
1650
+ "lat": area_info["lat"],
1651
+ "lng": area_info["lng"],
1652
+ "total_count": count,
1653
+ "donor_count": donors,
1654
+ "client_count": count - donors,
1655
+ "causes": {
1656
+ cause: random.randint(1, count)
1657
+ for cause in random.sample(cause_list, min(3, len(cause_list)))
1658
+ },
1659
+ "housing_breakdown": {
1660
+ "hdb_4_room": random.randint(0, count // 2),
1661
+ "condo": random.randint(0, count // 3),
1662
+ "landed": random.randint(0, count // 4),
1663
+ },
1664
+ "avg_income_proxy": round(random.uniform(0.3, 0.8), 3),
1665
+ }
1666
+
1667
+ # Generate individual points
1668
+ for i in range(count):
1669
+ lat = area_info["lat"] + (random.random() - 0.5) * 0.02
1670
+ lng = area_info["lng"] + (random.random() - 0.5) * 0.02
1671
+ housing_types = [
1672
+ "hdb_3_room",
1673
+ "hdb_4_room",
1674
+ "hdb_5_room",
1675
+ "hdb_executive",
1676
+ "condo",
1677
+ "landed",
1678
+ ]
1679
+
1680
+ individual_points.append(
1681
+ {
1682
+ "id": f"mock_{area_id}_{i}",
1683
+ "type": "donor" if i < donors else "client",
1684
+ "lat": lat,
1685
+ "lng": lng,
1686
+ "planning_area": area_id,
1687
+ "housing_type": random.choice(housing_types),
1688
+ "causes": random.sample(cause_list, min(2, len(cause_list))),
1689
+ "is_donor": i < donors,
1690
+ }
1691
+ )
1692
+
1693
+ # Create GeoJSON
1694
+ area_geojson = {
1695
+ "type": "FeatureCollection",
1696
+ "features": [
1697
+ {
1698
+ "type": "Feature",
1699
+ "geometry": {
1700
+ "type": "Point",
1701
+ "coordinates": [stats["lng"], stats["lat"]],
1702
+ },
1703
+ "properties": {
1704
+ "planning_area": area,
1705
+ "name": stats["name"],
1706
+ **{k: v for k, v in stats.items() if k not in ["lat", "lng"]},
1707
+ },
1708
+ }
1709
+ for area, stats in area_stats.items()
1710
+ ],
1711
+ }
1712
+
1713
+ points_geojson = {
1714
+ "type": "FeatureCollection",
1715
+ "features": [
1716
+ {
1717
+ "type": "Feature",
1718
+ "geometry": {"type": "Point", "coordinates": [p["lng"], p["lat"]]},
1719
+ "properties": {k: v for k, v in p.items() if k not in ["lat", "lng"]},
1720
+ }
1721
+ for p in individual_points
1722
+ ],
1723
+ }
1724
+
1725
+ return {
1726
+ "total_donors": sum(s["donor_count"] for s in area_stats.values()),
1727
+ "total_clients": sum(s["client_count"] for s in area_stats.values()),
1728
+ "areas_with_data": len(area_stats),
1729
+ "summary": {
1730
+ "top_causes": [(c, random.randint(10, 50)) for c in cause_list[:5]],
1731
+ "housing_distribution": {
1732
+ "hdb_4_room": 120,
1733
+ "condo": 45,
1734
+ "landed": 20,
1735
+ "hdb_5_room": 30,
1736
+ },
1737
+ },
1738
+ "area_aggregates": area_geojson,
1739
+ "individual_points": points_geojson,
1740
+ "planning_areas": PLANNING_AREAS,
1741
+ }
1742
+
1743
+
1744
+ @app.get("/debug/search-donors")
1745
+ async def debug_search_donors(cause: str = "education", limit: int = 10):
1746
+ """
1747
+ Debug endpoint to directly search for donors in the database.
1748
+
1749
+ This bypasses the GIS recommender to see raw database results.
1750
+ """
1751
+ if not encoder or not vector_store:
1752
+ return {"error": "Encoder or database not available"}
1753
+
1754
+ try:
1755
+ # Create a simple query embedding
1756
+ query_text = f"Donor interested in {cause} causes, looking to support {cause} initiatives"
1757
+ query_embedding = await encoder.encode(query_text)
1758
+
1759
+ # Search for donors
1760
+ donor_results = await vector_store.find_similar(
1761
+ query_embedding=query_embedding,
1762
+ form_type="donor",
1763
+ limit=limit,
1764
+ )
1765
+
1766
+ # Also search for clients
1767
+ client_results = await vector_store.find_similar(
1768
+ query_embedding=query_embedding,
1769
+ form_type="client",
1770
+ limit=limit,
1771
+ )
1772
+
1773
+ return {
1774
+ "query_cause": cause,
1775
+ "donor_results": [
1776
+ {
1777
+ "id": r.id,
1778
+ "form_type": r.form_type,
1779
+ "score": round(r.score, 4),
1780
+ "distance": round(r.distance, 4),
1781
+ "causes": r.form_data.get("causes", []),
1782
+ "country": r.form_data.get("country"),
1783
+ }
1784
+ for r in donor_results
1785
+ ],
1786
+ "client_results": [
1787
+ {
1788
+ "id": r.id,
1789
+ "form_type": r.form_type,
1790
+ "score": round(r.score, 4),
1791
+ "distance": round(r.distance, 4),
1792
+ "causes": r.form_data.get("causes", []),
1793
+ "planning_area": r.form_data.get("planning_area"),
1794
+ }
1795
+ for r in client_results
1796
+ ],
1797
+ "total_donors": len(donor_results),
1798
+ "total_clients": len(client_results),
1799
+ }
1800
+ except Exception as e:
1801
+ import traceback
1802
+
1803
+ return {"error": str(e), "traceback": traceback.format_exc()}
1804
+
1805
+
1806
+ # ============================================================================
1807
+ # Main
1808
+ # ============================================================================
1809
+
1810
+ if __name__ == "__main__":
1811
+ import uvicorn
1812
+
1813
+ # Windows-specific fix: must be set before uvicorn starts its event loop
1814
+ if sys.platform == "win32":
1815
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
1816
+
1817
+ uvicorn.run(app, host="0.0.0.0", port=7860)
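The mock scorer in `_get_mock_lookalike_response` above blends cause overlap, a housing income proxy, and a neutral proximity term with 0.5/0.3/0.2 weights. A standalone sketch of that blend — note the `HOUSING_PROXY` table here is illustrative, not the real `HOUSING_INCOME_PROXY` mapping:

```python
# Sketch of the mock scoring blend (weights match the mock path: 0.5/0.3/0.2).
# HOUSING_PROXY is a made-up stand-in for HOUSING_INCOME_PROXY.
HOUSING_PROXY = {"hdb_4_room": 0.5, "condo": 0.7, "landed": 0.9}

def mock_score(client_causes, seed_causes, housing_type):
    # Cause overlap stands in for vector similarity in the mock path
    cause_match = len(set(client_causes) & set(seed_causes)) / max(len(seed_causes), 1)
    spatial = HOUSING_PROXY.get(housing_type, 0.5)  # income proxy by housing type
    return 0.5 * cause_match + 0.3 * spatial + 0.2 * 0.5  # 0.5 = neutral proximity

score = mock_score(["education", "health"], ["education"], "condo")
```

Because the proximity term is fixed at 0.5 in the mock path, ranking is driven entirely by cause overlap and the housing proxy.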
encoders/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Text encoders for embedding generation."""
2
+ from .base import BaseEncoder
3
+ from .sealion import SeaLionEncoder
4
+
5
+ __all__ = ["BaseEncoder", "SeaLionEncoder"]
encoders/base.py ADDED
@@ -0,0 +1,43 @@
1
+ """Base encoder abstract class."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import List
5
+ import numpy as np
6
+
7
+
8
+ class BaseEncoder(ABC):
9
+ """Base class for text encoders.
10
+
11
+ Provides a common interface for encoding text into vector embeddings.
12
+ Implementations can use different models (SeaLion, OpenAI, etc.).
13
+ """
14
+
15
+ @property
16
+ @abstractmethod
17
+ def embedding_dimension(self) -> int:
18
+ """Return the native embedding dimension of this encoder."""
19
+ pass
20
+
21
+ @abstractmethod
22
+ async def encode(self, text: str) -> np.ndarray:
23
+ """Encode a single text into a vector.
24
+
25
+ Args:
26
+ text: The text to encode.
27
+
28
+ Returns:
29
+ A numpy array of shape (embedding_dimension,).
30
+ """
31
+ pass
32
+
33
+ @abstractmethod
34
+ async def encode_batch(self, texts: List[str]) -> np.ndarray:
35
+ """Encode multiple texts into vectors (batch processing).
36
+
37
+ Args:
38
+ texts: List of texts to encode.
39
+
40
+ Returns:
41
+ A numpy array of shape (len(texts), embedding_dimension).
42
+ """
43
+ pass
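The abstract contract above is easiest to see with a concrete subclass. This `HashEncoder` is a hypothetical offline sketch (deterministic bag-of-words feature hashing), not part of the package; it stands alone rather than importing `BaseEncoder`:

```python
import asyncio
import hashlib
from typing import List

import numpy as np

class HashEncoder:  # would subclass BaseEncoder in the real package
    """Hypothetical fallback encoder: deterministic feature hashing, no network."""

    def __init__(self, dimension: int = 1024):
        self._dim = dimension

    @property
    def embedding_dimension(self) -> int:
        return self._dim

    async def encode(self, text: str) -> np.ndarray:
        vec = np.zeros(self._dim, dtype=np.float32)
        for token in text.lower().split():
            # Stable hash -> bucket index (builtin hash() is salted per process)
            idx = int(hashlib.md5(token.encode()).hexdigest(), 16) % self._dim
            vec[idx] += 1.0
        norm = np.linalg.norm(vec)
        return vec / norm if norm > 0 else vec

    async def encode_batch(self, texts: List[str]) -> np.ndarray:
        return np.stack([await self.encode(t) for t in texts])

emb = asyncio.run(HashEncoder(dimension=64).encode("donor interested in education"))
```

Normalizing to unit length keeps cosine similarity equivalent to a dot product, which matches how vector stores typically score.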
encoders/sealion.py ADDED
@@ -0,0 +1,382 @@
1
+ """SeaLion encoder for ASEAN multilingual form analysis.
2
+
3
+ Uses SeaLion chat API to analyze donor/volunteer forms and extract
4
+ structured features for embedding generation. SeaLion is chosen for its
5
+ knowledge of ASEAN nations and multilingual capabilities.
6
+
7
+ API Details:
8
+ - Base URL: Set via SEALION_ENDPOINT environment variable
9
+ - Endpoint: POST /chat
10
+ - Request: {"prompt": "...", "system": "..."}
11
+ - Response: OpenAI-compatible format with choices[0].message.content
12
+ """
13
+
14
+ import os
15
+ import httpx
16
+ import json
17
+ import hashlib
18
+ import numpy as np
19
+ from typing import List, Optional, Dict, Any
20
+ from .base import BaseEncoder
21
+
22
+
23
+ # Feature categories for encoding (used for vector generation)
24
+ CAUSE_CATEGORIES = [
25
+ "education", "health", "environment", "poverty", "children",
26
+ "elderly", "disability", "animals", "arts", "sports",
27
+ "disaster_relief", "human_rights", "technology", "agriculture", "housing"
28
+ ]
29
+
30
+ ASEAN_COUNTRIES = ["SG", "MY", "TH", "VN", "ID", "PH", "MM", "KH", "LA", "BN"]
31
+
32
+ LANGUAGES = ["en", "ms", "th", "vi", "id", "tl", "my", "km", "lo", "zh"]
33
+
34
+ AVAILABILITY_TYPES = ["weekends", "evenings", "flexible", "full_time", "event_based"]
35
+
36
+ DONOR_TYPES = ["individual", "corporate", "foundation"]
37
+
38
+ VOLUNTEER_TYPES = ["regular", "event_based", "skilled"]
39
+
40
+
41
+ class SeaLionEncoder(BaseEncoder):
42
+ """SeaLion encoder using chat API for form analysis.
43
+
44
+ Uses SeaLion's ASEAN knowledge and multilingual capabilities to:
45
+ 1. Analyze form content semantically
46
+ 2. Extract structured features
47
+ 3. Generate embeddings suitable for similarity matching
48
+
49
+ The encoder combines:
50
+ - Feature extraction via SeaLion chat API
51
+ - Deterministic feature hashing for categorical data
52
+ - Semantic scoring from LLM analysis
53
+ """
54
+
55
+ # Fixed embedding dimension (matches Supabase EMBED_DIMENSION)
56
+ _feature_dimension: int = 1024
57
+
58
+ def __init__(
59
+ self,
60
+ endpoint_url: Optional[str] = None,
61
+ timeout: float = 60.0,
62
+ max_retries: int = 3
63
+ ):
64
+ """Initialize SeaLion encoder.
65
+
66
+ Args:
67
+ endpoint_url: The SeaLion API base URL. If not provided,
68
+ reads from SEALION_ENDPOINT environment variable.
69
+ timeout: Request timeout in seconds.
70
+ max_retries: Maximum number of retry attempts on failure.
71
+ """
72
+ url = endpoint_url or os.getenv("SEALION_ENDPOINT")
73
+ if not url:
74
+ raise ValueError("SEALION_ENDPOINT environment variable is required")
75
+ self.endpoint_url = url.rstrip("/")
76
+ self.timeout = timeout
77
+         self.max_retries = max_retries
+ 
+     @property
+     def embedding_dimension(self) -> int:
+         """Return the embedding dimension (fixed at 1024)."""
+         return self._feature_dimension
+ 
+     def _build_system_prompt(self) -> str:
+         """Build system prompt for SeaLion analysis."""
+         return """You are an ASEAN donor/volunteer profile analyzer. Your task is to analyze form data and extract structured features for matching.
+ 
+ Analyze the provided form and respond with a JSON object containing these fields:
+ 
+ 1. "causes": List of relevant cause categories from: education, health, environment, poverty, children, elderly, disability, animals, arts, sports, disaster_relief, human_rights, technology, agriculture, housing
+ 
+ 2. "cause_scores": Object with scores (0.0-1.0) for each relevant cause based on text sentiment and context
+ 
+ 3. "engagement_level": Score from 0.0 to 1.0 indicating commitment level (based on frequency, bio, motivation)
+ 
+ 4. "experience_level": Score from 0.0 to 1.0 indicating prior experience
+ 
+ 5. "financial_capacity": Score from 0.0 to 1.0 for donors (based on amount range, donor type)
+ 
+ 6. "skills_diversity": Score from 0.0 to 1.0 for volunteers (based on skills listed)
+ 
+ 7. "language_diversity": Score from 0.0 to 1.0 based on languages spoken
+ 
+ 8. "motivation_themes": List of key themes extracted from bio/motivation/goals text
+ 
+ 9. "regional_focus": Score from 0.0 to 1.0 indicating focus on ASEAN vs global causes
+ 
+ Respond ONLY with valid JSON, no explanation."""
+ 
+     async def _call_sealion(self, prompt: str) -> str:
+         """Call SeaLion chat API.
+ 
+         Args:
+             prompt: The user prompt to send.
+ 
+         Returns:
+             The response text from SeaLion.
+ 
+         Raises:
+             httpx.HTTPStatusError: If the request fails after retries.
+         """
+         last_error = None
+ 
+         async with httpx.AsyncClient(timeout=self.timeout) as client:
+             for attempt in range(self.max_retries):
+                 try:
+                     response = await client.post(
+                         f"{self.endpoint_url}/chat",
+                         headers={"Content-Type": "application/json"},
+                         json={
+                             "prompt": prompt,
+                             "system": self._build_system_prompt()
+                         }
+                     )
+                     response.raise_for_status()
+                     data = response.json()
+                     # Handle OpenAI-compatible format (choices array)
+                     if 'choices' in data and len(data['choices']) > 0:
+                         choice = data['choices'][0]
+                         if 'message' in choice:
+                             return choice['message'].get('content', '')
+                         if 'text' in choice:
+                             return choice['text']
+                     # Fall back to the simple {"response": ...} format
+                     return data.get("response", "")
+                 except httpx.HTTPStatusError as e:
+                     last_error = e
+                     if e.response.status_code >= 500:
+                         continue
+                     raise
+                 except httpx.RequestError as e:
+                     last_error = e
+                     continue
+ 
+         if last_error:
+             raise last_error
+         raise RuntimeError("SeaLion API call failed")
+ 
+     def _parse_sealion_response(self, response: str) -> Dict[str, Any]:
+         """Parse SeaLion JSON response.
+ 
+         Args:
+             response: Raw response text from SeaLion.
+ 
+         Returns:
+             Parsed JSON dictionary, or an empty dict if parsing fails.
+         """
+         try:
+             # Try to extract JSON from the response (it may have extra text)
+             start = response.find("{")
+             end = response.rfind("}") + 1
+             if start >= 0 and end > start:
+                 json_str = response[start:end]
+                 return json.loads(json_str)
+         except json.JSONDecodeError:
+             pass
+         return {}
+ 
+     def _hash_to_vector(self, text: str, dimension: int, offset: int = 0) -> np.ndarray:
+         """Convert text to a deterministic vector using hashing.
+ 
+         Args:
+             text: Text to hash.
+             dimension: Number of hash bytes to scatter.
+             offset: Offset into the full embedding space.
+ 
+         Returns:
+             A sparse vector contribution.
+         """
+         vector = np.zeros(self._feature_dimension, dtype=np.float32)
+         if not text:
+             return vector
+ 
+         # Use SHA256 for deterministic hashing
+         hash_bytes = hashlib.sha256(text.lower().encode()).digest()
+ 
+         # Convert byte pairs to (index, value) contributions
+         for i in range(0, min(len(hash_bytes), dimension), 2):
+             idx = (hash_bytes[i] + offset) % self._feature_dimension
+             val = (hash_bytes[i + 1] / 255.0) * 2 - 1  # Normalize to [-1, 1]
+             vector[idx] += val
+ 
+         return vector
+ 
+     def _encode_categorical(
+         self,
+         value: str,
+         categories: List[str],
+         start_idx: int
+     ) -> np.ndarray:
+         """One-hot encode a categorical value.
+ 
+         Args:
+             value: The value to encode.
+             categories: List of possible categories.
+             start_idx: Starting index in the embedding vector.
+ 
+         Returns:
+             Embedding contribution from this categorical.
+         """
+         vector = np.zeros(self._feature_dimension, dtype=np.float32)
+         value_lower = value.lower() if value else ""
+ 
+         for i, cat in enumerate(categories):
+             if cat.lower() in value_lower or value_lower in cat.lower():
+                 idx = (start_idx + i) % self._feature_dimension
+                 vector[idx] = 1.0
+                 break
+ 
+         return vector
+ 
+     def _encode_multi_categorical(
+         self,
+         values: List[str],
+         categories: List[str],
+         start_idx: int
+     ) -> np.ndarray:
+         """Multi-hot encode a list of categorical values.
+ 
+         Args:
+             values: List of values to encode.
+             categories: List of possible categories.
+             start_idx: Starting index in the embedding vector.
+ 
+         Returns:
+             Embedding contribution from these categoricals.
+         """
+         vector = np.zeros(self._feature_dimension, dtype=np.float32)
+         values_lower = [v.lower() for v in values] if values else []
+ 
+         for i, cat in enumerate(categories):
+             cat_lower = cat.lower()
+             for val in values_lower:
+                 if cat_lower in val or val in cat_lower:
+                     idx = (start_idx + i) % self._feature_dimension
+                     vector[idx] = 1.0
+                     break
+ 
+         return vector
+ 
+     def _build_embedding_from_features(
+         self,
+         form_text: str,
+         features: Dict[str, Any]
+     ) -> np.ndarray:
+         """Build the final embedding from extracted features.
+ 
+         Combines:
+         - Deterministic text hashing (semantic coverage)
+         - One-hot/multi-hot categorical encoding
+         - Continuous scores from SeaLion analysis
+ 
+         Args:
+             form_text: Original form text for hashing.
+             features: Extracted features from SeaLion.
+ 
+         Returns:
+             Final embedding vector of shape (1024,).
+         """
+         embedding = np.zeros(self._feature_dimension, dtype=np.float32)
+ 
+         # Section 1 (indices 0-255): Text hash for semantic similarity
+         embedding += self._hash_to_vector(form_text, 256, offset=0)
+ 
+         # Section 2 (indices 256-511): Cause categories
+         causes = features.get("causes", [])
+         embedding += self._encode_multi_categorical(causes, CAUSE_CATEGORIES, 256)
+ 
+         # Section 3 (indices 512-526): Cause scores
+         cause_scores = features.get("cause_scores", {})
+         for i, cause in enumerate(CAUSE_CATEGORIES):
+             idx = 512 + i
+             if cause in cause_scores:
+                 embedding[idx] = float(cause_scores[cause])
+ 
+         # Section 4 (indices 528-537): Country encoding
+         # Extract country mentions from the form text
+         for i, country in enumerate(ASEAN_COUNTRIES):
+             if country.lower() in form_text.lower():
+                 embedding[528 + i] = 1.0
+ 
+         # Section 5 (indices 538-547): Language encoding
+         for i, lang in enumerate(LANGUAGES):
+             if lang.lower() in form_text.lower():
+                 embedding[538 + i] = 1.0
+ 
+         # Section 6 (indices 548-557): Continuous scores
+         embedding[548] = features.get("engagement_level", 0.5)
+         embedding[549] = features.get("experience_level", 0.5)
+         embedding[550] = features.get("financial_capacity", 0.5)
+         embedding[551] = features.get("skills_diversity", 0.5)
+         embedding[552] = features.get("language_diversity", 0.5)
+         embedding[553] = features.get("regional_focus", 0.5)
+ 
+         # Section 7 (indices 558-572): Donor/volunteer type encoding
+         embedding += self._encode_categorical(
+             form_text, DONOR_TYPES, 558
+         )
+         embedding += self._encode_categorical(
+             form_text, VOLUNTEER_TYPES, 563
+         )
+         embedding += self._encode_categorical(
+             form_text, AVAILABILITY_TYPES, 568
+         )
+ 
+         # Section 8 (indices 600-1023): Motivation themes hash
+         themes = features.get("motivation_themes", [])
+         if themes:
+             themes_text = " ".join(themes)
+             embedding += self._hash_to_vector(themes_text, 424, offset=600)
+ 
+         # Normalize the embedding to unit length
+         norm = np.linalg.norm(embedding)
+         if norm > 0:
+             embedding = embedding / norm
+ 
+         return embedding
+ 
+     async def encode(self, text: str) -> np.ndarray:
+         """Encode form text using SeaLion analysis.
+ 
+         Process:
+         1. Send form text to SeaLion for semantic analysis
+         2. Parse extracted features from the response
+         3. Build the embedding from features + text hashing
+ 
+         Args:
+             text: The form text to encode.
+ 
+         Returns:
+             A numpy array of shape (1024,).
+         """
+         # Get SeaLion analysis
+         response = await self._call_sealion(
+             f"Analyze this donor/volunteer form:\n\n{text}"
+         )
+         features = self._parse_sealion_response(response)
+ 
+         # Build embedding from features
+         return self._build_embedding_from_features(text, features)
+ 
+     async def encode_batch(self, texts: List[str]) -> np.ndarray:
+         """Encode multiple form texts.
+ 
+         Note: This makes sequential API calls since the SeaLion API
+         doesn't support batch requests.
+ 
+         Args:
+             texts: List of form texts to encode.
+ 
+         Returns:
+             A numpy array of shape (len(texts), 1024).
+         """
+         if not texts:
+             return np.zeros((0, self._feature_dimension), dtype=np.float32)
+ 
+         embeddings = []
+         for text in texts:
+             emb = await self.encode(text)
+             embeddings.append(emb)
+ 
+         return np.vstack(embeddings)
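The byte-pair hashing behind `_hash_to_vector` can be sketched standalone. The helper below mirrors its logic outside the class (`hash_to_vector` and its parameter names are illustrative, not part of the module); it shows why the same text, regardless of case, always scatters into the same sparse vector:

```python
import hashlib

import numpy as np


def hash_to_vector(text: str, dim: int = 1024, take: int = 16, offset: int = 0) -> np.ndarray:
    """Scatter SHA256 byte pairs into a sparse vector, as _hash_to_vector does."""
    vec = np.zeros(dim, dtype=np.float32)
    if not text:
        return vec
    digest = hashlib.sha256(text.lower().encode()).digest()
    for i in range(0, min(len(digest), take), 2):
        idx = (digest[i] + offset) % dim       # even byte picks the index
        val = (digest[i + 1] / 255.0) * 2 - 1  # odd byte maps to [-1, 1]
        vec[idx] += val
    return vec


a = hash_to_vector("Education in Vietnam")
b = hash_to_vector("education in vietnam")  # lowercasing makes it case-insensitive
assert np.array_equal(a, b)
assert np.any(a != 0)
```

Because the projection is deterministic, two forms sharing key phrases end up with overlapping non-zero indices, which is what gives the "Section 1" hash its coarse semantic-similarity signal.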
graph/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """LangGraph chat graph components."""
+ from .builder import build_graph_with_memory
+ from .state import State
+ from .router import router
+ 
+ __all__ = ["build_graph_with_memory", "State", "router"]
graph/builder.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ from langgraph.graph import StateGraph, START, END
+ from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver
+ from langgraph.store.postgres.aio import AsyncPostgresStore
+ from psycopg_pool import AsyncConnectionPool
+ from langchain_ollama import ChatOllama
+ 
+ from .state import State
+ from .router import router
+ from agents.therapist import TherapistAgent
+ from agents.logical import LogicalAgent
+ from agents.classifier import create_classifier
+ from agents.charity_search import CharitySearchAgent
+ from agents.agentic_rag import AgenticRAGAgent
+ from encoders.sealion import SeaLionEncoder
+ from recommender.vector_store import DonorVectorStore
+ 
+ 
+ def create_connection_string() -> str:
+     """Build a PostgreSQL connection string from environment variables."""
+     db_host = os.getenv("SUPABASE_DB_HOST", "localhost")
+     db_port = os.getenv("SUPABASE_DB_PORT", "6543")
+     db_name = os.getenv("SUPABASE_DB_NAME", "postgres")
+     db_user = os.getenv("SUPABASE_DB_USER", "postgres")
+     db_password = os.getenv("SUPABASE_DB_PASSWORD", "")
+     db_sslmode = os.getenv("SUPABASE_DB_SSLMODE", "require")
+ 
+     return (
+         f"postgres://{db_user}:{db_password}"
+         f"@{db_host}:{db_port}/{db_name}"
+         f"?sslmode={db_sslmode}"
+     )
+ 
+ 
+ def create_async_pool() -> AsyncConnectionPool:
+     """Create an AsyncConnectionPool with proper settings."""
+     return AsyncConnectionPool(
+         conninfo=create_connection_string(),
+         max_size=20,
+         kwargs={
+             "autocommit": True,
+             "prepare_threshold": None,
+         }
+     )
+ 
+ 
+ async def build_graph_with_memory():
+     """Build the graph with a Supabase-backed checkpointer and store."""
+ 
+     # Create async connection pool
+     pool = create_async_pool()
+     await pool.open()
+ 
+     # Create checkpointer and store from the pool
+     checkpointer = AsyncPostgresSaver(pool)
+     store = AsyncPostgresStore(pool)
+ 
+     # Set up tables for the store and checkpointer
+     print("\n[Setup] Setting up LangGraph store and checkpointer tables...")
+     await checkpointer.setup()
+     await store.setup()
+     print("[OK] Store and checkpointer tables created!\n")
+ 
+     # Use Ollama cloud with API key authentication
+     api_key = os.getenv('OLLAMA_API_KEY')
+     if api_key:
+         llm = ChatOllama(
+             model="gpt-oss:120b",
+             base_url="https://ollama.com",
+             client_kwargs={
+                 "headers": {"Authorization": f"Bearer {api_key}"}
+             }
+         )
+     else:
+         # Fall back to local Ollama if no API key is set
+         llm = ChatOllama(model="gpt-oss:120b-cloud")
+ 
+     # Initialize the encoder and vector store for Agentic RAG
+     encoder = None
+     vector_store = None
+     try:
+         sealion_endpoint = os.getenv("SEALION_ENDPOINT")
+         if sealion_endpoint:
+             encoder = SeaLionEncoder(endpoint_url=sealion_endpoint)
+             vector_store = DonorVectorStore(pool)
+             print("[OK] Agentic RAG initialized with SeaLion encoder\n")
+     except Exception as e:
+         print(f"[WARN] Agentic RAG not available: {e}\n")
+ 
+     # Create the Agentic RAG agent
+     agentic_rag_agent = AgenticRAGAgent(llm, encoder, vector_store)
+ 
+     # Build the graph
+     graph_builder = StateGraph(State)
+     graph_builder.add_node("classifier", create_classifier(llm))
+     graph_builder.add_node("therapist", TherapistAgent(llm))
+     graph_builder.add_node("logical", LogicalAgent(llm))
+     graph_builder.add_node("charity_search", CharitySearchAgent(llm))
+     graph_builder.add_node("agentic_rag", agentic_rag_agent)
+ 
+     graph_builder.add_edge(START, "classifier")
+     graph_builder.add_conditional_edges(
+         "classifier",
+         router,
+         {
+             "therapist": "therapist",
+             "logical": "logical",
+             "charity_search": "charity_search",
+             "agentic_rag": "agentic_rag"
+         }
+     )
+     graph_builder.add_edge("therapist", END)
+     graph_builder.add_edge("logical", END)
+     graph_builder.add_edge("charity_search", END)
+     graph_builder.add_edge("agentic_rag", END)
+ 
+     # Compile with the store and checkpointer
+     graph = graph_builder.compile(
+         checkpointer=checkpointer,
+         store=store,
+     )
+ 
+     return graph, store, checkpointer
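Under the defaults above, `create_connection_string` assembles a libpq-style URI from the `SUPABASE_DB_*` variables. A standalone sketch of that assembly (`build_dsn` and the host/password values are illustrative stand-ins, not part of the module):

```python
def build_dsn(user: str, password: str, host: str, port: str,
              name: str, sslmode: str = "require") -> str:
    # Mirrors create_connection_string's URI assembly
    return (
        f"postgres://{user}:{password}"
        f"@{host}:{port}/{name}"
        f"?sslmode={sslmode}"
    )


dsn = build_dsn("postgres", "secret", "db.example.supabase.co", "6543", "postgres")
assert dsn == (
    "postgres://postgres:secret@db.example.supabase.co:6543/postgres"
    "?sslmode=require"
)
```

Port 6543 is the Supabase connection-pooler port, which is why the pool also sets `prepare_threshold=None`: transaction-mode poolers generally don't support server-side prepared statements.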
graph/router.py ADDED
@@ -0,0 +1,15 @@
+ from .state import State
+ 
+ 
+ def router(state: State):
+     """Route to the appropriate agent based on message type."""
+     message_type = state.get("message_type", "logical")
+     if message_type == "emotional":
+         return "therapist"
+     if message_type == "charity_search":
+         return "charity_search"
+     if message_type in ("donor_search", "volunteer_search"):
+         return "agentic_rag"
+     return "logical"
graph/state.py ADDED
@@ -0,0 +1,8 @@
+ from typing_extensions import TypedDict
+ from typing import Annotated
+ from langgraph.graph.message import add_messages
+ 
+ 
+ class State(TypedDict):
+     messages: Annotated[list, add_messages]
+     message_type: str | None
recommender/__init__.py ADDED
@@ -0,0 +1,29 @@
+ """Recommender system components."""
+ from .vector_store import DonorVectorStore, SimilarityResult
+ from .gis_recommender import (
+     GISRecommender,
+     ClientProfile,
+     ScoredClient,
+     HousingType,
+     PLANNING_AREAS,
+     HOUSING_INCOME_PROXY,
+     generate_mock_clients,
+     generate_seed_donor_profile,
+     EmbeddingReducer,
+     HybridSemanticSpatialEncoder,
+ )
+ 
+ __all__ = [
+     "DonorVectorStore",
+     "SimilarityResult",
+     "GISRecommender",
+     "ClientProfile",
+     "ScoredClient",
+     "HousingType",
+     "PLANNING_AREAS",
+     "HOUSING_INCOME_PROXY",
+     "generate_mock_clients",
+     "generate_seed_donor_profile",
+     "EmbeddingReducer",
+     "HybridSemanticSpatialEncoder",
+ ]
recommender/gis_recommender.py ADDED
@@ -0,0 +1,1202 @@
+ """
+ GIS-based Donor/Client Recommender System for ASEAN targeting.
+ 
+ This module implements:
+ 1. Lookalike Retrieval: Find top-K nearest neighbors using cosine similarity
+ 2. Spatial Filtering: Geo-fence filtering by Singapore planning areas
+ 3. Tiered Targeting: Ranking based on vector similarity, spatial proxy, and donation history
+ 4. GeoJSON Export: Output for map-based dashboard visualization
+ 5. Dimensionality Reduction: PCA for compact semantic representation
+ 
+ Privacy Note:
+ - PII (names, exact addresses) is stored as encrypted metadata, NOT in the vector
+ - Coordinates are stored with reduced precision (3 decimal places, ~100m accuracy)
+ - Only behavioral/interest data is embedded in the vector space
+ 
+ Dimensionality Reduction Strategy:
+ - Store BOTH the full 1024-dim embedding AND a reduced representation
+ - Reduced dimensions (default 8D) enable:
+   1. Better matching with small datasets (less noise)
+   2. Combination with geo-coordinates for hybrid semantic-spatial search
+   3. Visualization in 2D/3D space
+ """
+ 
+ import json
+ import hashlib
+ from typing import List, Optional, Dict, Any, Tuple, Union
+ from dataclasses import dataclass, field, asdict
+ from enum import Enum
+ import numpy as np
+ 
+ 
+ # ============================================================================
+ # Dimensionality Reduction Utilities
+ # ============================================================================
+ 
+ 
+ class EmbeddingReducer:
+     """
+     Reduces high-dimensional embeddings to lower dimensions using PCA.
+ 
+     For small datasets, this helps:
+     1. Remove noise from sparse dimensions
+     2. Enable combination with geo-coordinates
+     3. Improve similarity matching with limited data
+     """
+ 
+     def __init__(self, n_components: int = 8):
+         """
+         Initialize the reducer.
+ 
+         Args:
+             n_components: Target dimensionality (default 8 for the semantic space)
+         """
+         self.n_components = n_components
+         self._mean = None
+         self._components = None
+         self._is_fitted = False
+ 
+     def fit(self, embeddings: np.ndarray) -> "EmbeddingReducer":
+         """
+         Fit PCA on a set of embeddings.
+ 
+         Args:
+             embeddings: (N, D) array of embeddings
+ 
+         Returns:
+             self for chaining
+         """
+         if embeddings.shape[0] < 2:
+             # Not enough data to fit PCA; use an identity-like projection
+             self._mean = np.zeros(embeddings.shape[1])
+             # Fall back to the first n_components identity axes
+             self._components = np.eye(embeddings.shape[1])[: self.n_components]
+             self._is_fitted = True
+             return self
+ 
+         # Center the data
+         self._mean = np.mean(embeddings, axis=0)
+         centered = embeddings - self._mean
+ 
+         # Simple PCA via SVD (works for small datasets)
+         try:
+             U, S, Vt = np.linalg.svd(centered, full_matrices=False)
+             self._components = Vt[: self.n_components]
+         except np.linalg.LinAlgError:
+             # SVD failed; fall back to the top-variance dimensions
+             variances = np.var(centered, axis=0)
+             top_dims = np.argsort(variances)[-self.n_components :]
+             self._components = np.eye(embeddings.shape[1])[top_dims]
+ 
+         self._is_fitted = True
+         return self
+ 
+     def transform(self, embeddings: np.ndarray) -> np.ndarray:
+         """
+         Transform embeddings to reduced dimensionality.
+ 
+         Args:
+             embeddings: (N, D) or (D,) array of embeddings
+ 
+         Returns:
+             (N, n_components) or (n_components,) reduced embeddings
+         """
+         if not self._is_fitted:
+             # Auto-fit on this data if not fitted
+             if embeddings.ndim == 1:
+                 embeddings = embeddings.reshape(1, -1)
+             self.fit(embeddings)
+ 
+         single = embeddings.ndim == 1
+         if single:
+             embeddings = embeddings.reshape(1, -1)
+ 
+         centered = embeddings - self._mean
+         reduced = centered @ self._components.T
+ 
+         # Normalize to unit length for cosine similarity
+         norms = np.linalg.norm(reduced, axis=1, keepdims=True)
+         norms = np.where(norms > 0, norms, 1)
+         reduced = reduced / norms
+ 
+         return reduced[0] if single else reduced
+ 
+     def fit_transform(self, embeddings: np.ndarray) -> np.ndarray:
+         """Fit and transform in one step."""
+         return self.fit(embeddings).transform(embeddings)
+ 
+     @staticmethod
+     def compute_sparse_projection(
+         embedding: np.ndarray, n_components: int = 8
+     ) -> np.ndarray:
+         """
+         Fast projection for sparse embeddings without fitting.
+ 
+         Selects the top-k dimensions with the highest absolute values.
+         Good for single queries when no training data is available.
+         """
+         # Find non-zero dimensions
+         nonzero_mask = np.abs(embedding) > 1e-6
+         nonzero_indices = np.where(nonzero_mask)[0]
+ 
+         if len(nonzero_indices) <= n_components:
+             # Few enough non-zero dims; use them directly
+             result = np.zeros(n_components)
+             result[: len(nonzero_indices)] = embedding[nonzero_indices]
+         else:
+             # Take the top-k by absolute value
+             top_k_in_nonzero = np.argsort(np.abs(embedding[nonzero_indices]))[
+                 -n_components:
+             ]
+             top_k_indices = nonzero_indices[top_k_in_nonzero]
+             result = embedding[top_k_indices]
+ 
+         # Normalize
+         norm = np.linalg.norm(result)
+         if norm > 0:
+             result = result / norm
+ 
+         return result
+ 
+ 
+ class HybridSemanticSpatialEncoder:
+     """
+     Combines semantic embeddings with geographic coordinates.
+ 
+     Creates a hybrid vector that captures both:
+     1. Semantic similarity (interests, causes)
+     2. Spatial proximity (location)
+ 
+     This enables "find people with similar interests NEAR this location"
+     without strict geo-fencing.
+     """
+ 
+     def __init__(
+         self,
+         semantic_dims: int = 8,
+         spatial_weight: float = 0.3,
+         semantic_weight: float = 0.7,
+     ):
+         """
+         Initialize the hybrid encoder.
+ 
+         Args:
+             semantic_dims: Reduced semantic dimensions
+             spatial_weight: Weight for the spatial component (0-1)
+             semantic_weight: Weight for the semantic component (0-1)
+         """
+         self.semantic_dims = semantic_dims
+         self.spatial_weight = spatial_weight
+         self.semantic_weight = semantic_weight
+         self.reducer = EmbeddingReducer(n_components=semantic_dims)
+ 
+         # Singapore bounding box for normalization
+         self.lat_min, self.lat_max = 1.15, 1.47  # ~35km range
+         self.lng_min, self.lng_max = 103.6, 104.1  # ~55km range
+ 
+     def normalize_coordinates(self, lat: float, lng: float) -> Tuple[float, float]:
+         """Normalize coordinates to the [0, 1] range within Singapore."""
+         norm_lat = (lat - self.lat_min) / (self.lat_max - self.lat_min)
+         norm_lng = (lng - self.lng_min) / (self.lng_max - self.lng_min)
+         return (np.clip(norm_lat, 0, 1), np.clip(norm_lng, 0, 1))
+ 
+     def encode(
+         self, embedding: np.ndarray, coordinates: Tuple[float, float]
+     ) -> np.ndarray:
+         """
+         Create a hybrid semantic-spatial vector.
+ 
+         Args:
+             embedding: Full semantic embedding (1024-dim)
+             coordinates: (lat, lng) tuple
+ 
+         Returns:
+             Hybrid vector of dimension (semantic_dims + 2)
+         """
+         # Reduce the semantic embedding
+         if embedding.ndim == 1 and len(embedding) > self.semantic_dims:
+             semantic = EmbeddingReducer.compute_sparse_projection(
+                 embedding, self.semantic_dims
+             )
+         else:
+             semantic = embedding[: self.semantic_dims]
+ 
+         # Normalize the spatial part
+         norm_lat, norm_lng = self.normalize_coordinates(coordinates[0], coordinates[1])
+         spatial = np.array([norm_lat, norm_lng])
+ 
+         # Combine with weights
+         weighted_semantic = semantic * self.semantic_weight
+         weighted_spatial = spatial * self.spatial_weight
+ 
+         return np.concatenate([weighted_semantic, weighted_spatial])
+ 
+     def compute_similarity(
+         self, query_hybrid: np.ndarray, candidate_hybrid: np.ndarray
+     ) -> float:
+         """
+         Compute similarity between hybrid vectors.
+ 
+         Uses cosine similarity for the semantic part and
+         inverse distance for the spatial part.
+         """
+         semantic_dims = self.semantic_dims
+ 
+         # Semantic similarity (cosine)
+         query_semantic = query_hybrid[:semantic_dims]
+         cand_semantic = candidate_hybrid[:semantic_dims]
+ 
+         dot = np.dot(query_semantic, cand_semantic)
+         norm_q = np.linalg.norm(query_semantic)
+         norm_c = np.linalg.norm(cand_semantic)
+ 
+         if norm_q > 0 and norm_c > 0:
+             semantic_sim = dot / (norm_q * norm_c)
+         else:
+             semantic_sim = 0.0
+ 
+         # Spatial similarity (inverse Euclidean distance)
+         query_spatial = query_hybrid[semantic_dims:]
+         cand_spatial = candidate_hybrid[semantic_dims:]
+ 
+         spatial_dist = np.linalg.norm(query_spatial - cand_spatial)
+         spatial_sim = 1.0 / (1.0 + spatial_dist * 10)  # Scale factor
+ 
+         # Combine
+         return self.semantic_weight * semantic_sim + self.spatial_weight * spatial_sim
+ 
+ 
+ # ============================================================================
+ # Singapore Planning Areas & Housing Data
+ # ============================================================================
+ 
+ 
+ class HousingType(str, Enum):
+     """Singapore housing types with income proxy scores."""
+ 
+     HDB_1_2_ROOM = "hdb_1_2_room"
+     HDB_3_ROOM = "hdb_3_room"
+     HDB_4_ROOM = "hdb_4_room"
+     HDB_5_ROOM = "hdb_5_room"
+     HDB_EXECUTIVE = "hdb_executive"
+     CONDO = "condo"
+     LANDED = "landed"
+     GCB = "gcb"  # Good Class Bungalow
+ 
+ 
+ # Housing type to income proxy score (0-1)
+ HOUSING_INCOME_PROXY = {
+     HousingType.HDB_1_2_ROOM: 0.1,
+     HousingType.HDB_3_ROOM: 0.25,
+     HousingType.HDB_4_ROOM: 0.4,
+     HousingType.HDB_5_ROOM: 0.55,
+     HousingType.HDB_EXECUTIVE: 0.65,
+     HousingType.CONDO: 0.75,
+     HousingType.LANDED: 0.85,
+     HousingType.GCB: 1.0,
+ }
+ 
+ # Singapore Planning Areas with approximate centroids
+ PLANNING_AREAS = {
+     "ang_mo_kio": {"name": "Ang Mo Kio", "lat": 1.3691, "lng": 103.8454},
+     "bedok": {"name": "Bedok", "lat": 1.3236, "lng": 103.9273},
+     "bishan": {"name": "Bishan", "lat": 1.3526, "lng": 103.8352},
+     "bukit_batok": {"name": "Bukit Batok", "lat": 1.3590, "lng": 103.7637},
+     "bukit_merah": {"name": "Bukit Merah", "lat": 1.2819, "lng": 103.8239},
+     "bukit_panjang": {"name": "Bukit Panjang", "lat": 1.3774, "lng": 103.7719},
+     "bukit_timah": {"name": "Bukit Timah", "lat": 1.3294, "lng": 103.8021},
+     "central": {"name": "Central Area", "lat": 1.2789, "lng": 103.8536},
+     "choa_chu_kang": {"name": "Choa Chu Kang", "lat": 1.3840, "lng": 103.7470},
+     "clementi": {"name": "Clementi", "lat": 1.3162, "lng": 103.7649},
+     "geylang": {"name": "Geylang", "lat": 1.3201, "lng": 103.8918},
+     "hougang": {"name": "Hougang", "lat": 1.3612, "lng": 103.8863},
+     "jurong_east": {"name": "Jurong East", "lat": 1.3329, "lng": 103.7436},
+     "jurong_west": {"name": "Jurong West", "lat": 1.3404, "lng": 103.7090},
+     "kallang": {"name": "Kallang", "lat": 1.3100, "lng": 103.8651},
+     "marine_parade": {"name": "Marine Parade", "lat": 1.3020, "lng": 103.9072},
+     "novena": {"name": "Novena", "lat": 1.3204, "lng": 103.8438},
+     "orchard": {"name": "Orchard", "lat": 1.3048, "lng": 103.8318},
+     "pasir_ris": {"name": "Pasir Ris", "lat": 1.3721, "lng": 103.9474},
+     "punggol": {"name": "Punggol", "lat": 1.3984, "lng": 103.9072},
+     "queenstown": {"name": "Queenstown", "lat": 1.2942, "lng": 103.7861},
+     "sembawang": {"name": "Sembawang", "lat": 1.4491, "lng": 103.8185},
+     "sengkang": {"name": "Sengkang", "lat": 1.3868, "lng": 103.8914},
+     "serangoon": {"name": "Serangoon", "lat": 1.3554, "lng": 103.8679},
+     "tampines": {"name": "Tampines", "lat": 1.3496, "lng": 103.9568},
+     "toa_payoh": {"name": "Toa Payoh", "lat": 1.3343, "lng": 103.8563},
+     "woodlands": {"name": "Woodlands", "lat": 1.4382, "lng": 103.7891},
+     "yishun": {"name": "Yishun", "lat": 1.4304, "lng": 103.8354},
+ }
+ 
+ 
+ # ============================================================================
+ # Data Models
+ # ============================================================================
+ 
+ 
+ @dataclass
+ class ClientProfile:
+     """Client/Donor profile with spatial and behavioral data.
+ 
+     Privacy considerations:
+     - user_id is a hashed identifier, not PII
+     - coordinates are stored at reduced precision (~100m accuracy)
+     - name_encrypted would be encrypted in production
+ 
+     Embedding Strategy:
+     - embedding: Full 1024-dim vector for accuracy at scale
+     - embedding_reduced: 8-dim compact vector for small-dataset matching
+     - hybrid_embedding: Semantic + spatial combined vector
+     """
+ 
+     user_id: str
+ 
+     # Spatial data (reduced precision for privacy)
+     coordinates: Tuple[float, float]  # (lat, lng) - 3 decimal precision
+     planning_area: str
+     housing_type: HousingType
+ 
+     # Behavioral/Interest data (embedded in the vector)
+     interests: List[str]
+     causes: List[str]
+     preferred_language: str
+ 
+     # Donation history
+     is_donor: bool = False
+     total_donated: float = 0.0
+     last_donation_amount: float = 0.0
+     last_org_donated: Optional[str] = None
+     donation_count: int = 0
+ 
+     # Metadata (not embedded)
+     name_encrypted: Optional[str] = None  # Would be encrypted in production
+     age_range: Optional[str] = None  # e.g., "25-34", "35-44"
+ 
+     # Vector embeddings
+     embedding: Optional[List[float]] = None  # Full 1024-dim
+     embedding_reduced: Optional[List[float]] = None  # Reduced 8-dim
+     hybrid_embedding: Optional[List[float]] = None  # Semantic + spatial (10-dim)
+ 
+     def to_embedding_text(self) -> str:
+         """Convert the profile to text for embedding generation."""
+         parts = [
+             f"Planning area: {self.planning_area}",
+             f"Housing: {self.housing_type.value}",
+             f"Interests: {', '.join(self.interests)}",
+             f"Causes: {', '.join(self.causes)}",
+             f"Language: {self.preferred_language}",
+         ]
+         if self.is_donor:
+             parts.append(f"Donor with {self.donation_count} donations")
+         return "\n".join(parts)
+ 
+     def compute_reduced_embeddings(self, semantic_dims: int = 8) -> None:
+         """
+         Compute reduced and hybrid embeddings from the full embedding.
+ 
+         Call this after setting the full embedding.
+         """
+         if self.embedding is None:
+             return
+ 
+         full_emb = np.array(self.embedding)
+ 
+         # Compute the reduced embedding using sparse projection
+         reduced = EmbeddingReducer.compute_sparse_projection(full_emb, semantic_dims)
+         self.embedding_reduced = reduced.tolist()
+ 
+         # Compute the hybrid embedding with the spatial component
+         encoder = HybridSemanticSpatialEncoder(semantic_dims=semantic_dims)
+         hybrid = encoder.encode(full_emb, self.coordinates)
+         self.hybrid_embedding = hybrid.tolist()
+ 
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to a dictionary for JSON serialization."""
+         return {
+             "user_id": self.user_id,
+             "coordinates": list(self.coordinates),
+             "planning_area": self.planning_area,
+             "housing_type": self.housing_type.value,
+             "interests": self.interests,
+             "causes": self.causes,
+             "preferred_language": self.preferred_language,
+             "is_donor": self.is_donor,
+             "total_donated": self.total_donated,
+             "last_donation_amount": self.last_donation_amount,
+             "last_org_donated": self.last_org_donated,
+             "donation_count": self.donation_count,
+             "age_range": self.age_range,
+             "has_reduced_embedding": self.embedding_reduced is not None,
+             "has_hybrid_embedding": self.hybrid_embedding is not None,
+         }
+ 
+ 
+ @dataclass
+ class ScoredClient:
+     """Client with computed targeting scores."""
+ 
+     client: ClientProfile
+ 
+     # Individual scores (0-1)
+     vector_similarity_score: float = 0.0
+     spatial_proxy_score: float = 0.0
+     proximity_score: float = 0.0
+ 
+     # Combined score
+     final_score: float = 0.0
+ 
+     # Distance from the query (for debugging)
+     vector_distance: float = 0.0
+     geo_distance_km: float = 0.0
+ 
+ 
+ @dataclass
+ class GeoJSONFeature:
+     """GeoJSON Feature for map visualization."""
+ 
+     type: str = "Feature"
+     geometry: Dict[str, Any] = field(default_factory=dict)
+     properties: Dict[str, Any] = field(default_factory=dict)
460
+
461
+
462
+ # ============================================================================
463
+ # GIS Recommender System
464
+ # ============================================================================
465
+
466
+
467
+ class GISRecommender:
468
+ """
469
+ GIS-enhanced recommender using vector similarity + spatial targeting.
470
+
471
+ Features:
472
+ 1. Lookalike retrieval using SEA-LION embeddings
473
+ 2. Geo-fence filtering by planning area
474
+ 3. Tiered scoring combining multiple signals
475
+ 4. GeoJSON export for visualization
476
+ 5. Hybrid semantic-spatial matching for small datasets
477
+ """
478
+
479
+ def __init__(self, vector_store=None, encoder=None):
480
+ """Initialize recommender with vector store and encoder."""
481
+ self.vector_store = vector_store
482
+ self.encoder = encoder
483
+
484
+ # Hybrid encoder for small dataset matching
485
+ self.hybrid_encoder = HybridSemanticSpatialEncoder(
486
+ semantic_dims=8, spatial_weight=0.3, semantic_weight=0.7
487
+ )
488
+
489
+ # Scoring weights (can be tuned)
490
+ self.weights = {
491
+ "vector_similarity": 0.5,
492
+ "spatial_proxy": 0.3,
493
+ "proximity": 0.2,
494
+ }
495
+
496
+ # Threshold for using hybrid matching
497
+ self.small_dataset_threshold = 100
498
+
499
+ @staticmethod
500
+ def haversine_distance(
501
+ coord1: Tuple[float, float], coord2: Tuple[float, float]
502
+ ) -> float:
503
+ """Calculate distance between two coordinates in kilometers."""
504
+ from math import radians, sin, cos, sqrt, atan2
505
+
506
+ lat1, lon1 = radians(coord1[0]), radians(coord1[1])
507
+ lat2, lon2 = radians(coord2[0]), radians(coord2[1])
508
+
509
+ dlat = lat2 - lat1
510
+ dlon = lon2 - lon1
511
+
512
+ a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
513
+ c = 2 * atan2(sqrt(a), sqrt(1 - a))
514
+
515
+ # Earth's radius in km
516
+ return 6371 * c
517
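The haversine method above is easy to get subtly wrong, so a standalone sanity check is useful. A minimal sketch of the same computation, tried on an illustrative pair of Singapore coordinates (only the distance range is asserted, not an exact value):

```python
from math import radians, sin, cos, sqrt, atan2

def haversine_distance(coord1, coord2):
    """Great-circle distance between two (lat, lng) pairs, in kilometers."""
    lat1, lon1 = radians(coord1[0]), radians(coord1[1])
    lat2, lon2 = radians(coord2[0]), radians(coord2[1])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    return 6371 * 2 * atan2(sqrt(a), sqrt(1 - a))

# Roughly 14 km across the island; assert a range rather than an exact value.
d = haversine_distance((1.3048, 103.8318), (1.3536, 103.9456))
assert 10 < d < 20
```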
+
+    @staticmethod
+    def reduce_coordinate_precision(
+        lat: float, lng: float, decimals: int = 3
+    ) -> Tuple[float, float]:
+        """Reduce coordinate precision for privacy (~100m at 3 decimals)."""
+        return (round(lat, decimals), round(lng, decimals))
+
+    def calculate_spatial_proxy_score(self, client: ClientProfile) -> float:
+        """Calculate income proxy score based on housing type."""
+        return HOUSING_INCOME_PROXY.get(client.housing_type, 0.5)
+
+    def calculate_proximity_score(
+        self,
+        client: ClientProfile,
+        event_locations: Optional[List[Tuple[float, float]]] = None,
+    ) -> float:
+        """
+        Calculate proximity score based on distance to successful donation events.
+
+        Lower distance = higher score.
+        """
+        if not event_locations:
+            return 0.5  # Default score if no events
+
+        # Find minimum distance to any event
+        min_distance = float("inf")
+        for event_coord in event_locations:
+            dist = self.haversine_distance(client.coordinates, event_coord)
+            min_distance = min(min_distance, dist)
+
+        # Convert distance to score (0-1)
+        # Max distance in Singapore ~40km, normalize accordingly
+        max_distance = 40.0
+        score = max(0, 1 - (min_distance / max_distance))
+        return score
+
+    def calculate_vector_similarity(self, distance: float) -> float:
+        """Convert L2 distance to a similarity score in (0, 1]."""
+        return 1.0 / (1.0 + distance)
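The `1 / (1 + d)` conversion above is worth pinning down with concrete values: it maps distance 0 to similarity 1.0 and decays smoothly toward (but never reaching) 0, while preserving ordering. A quick check:

```python
def l2_to_similarity(distance: float) -> float:
    # Monotonic map from L2 distance to (0, 1]: d=0 -> 1.0, larger d -> smaller score.
    return 1.0 / (1.0 + distance)

assert l2_to_similarity(0.0) == 1.0
assert l2_to_similarity(1.0) == 0.5
assert l2_to_similarity(3.0) == 0.25
assert l2_to_similarity(1.0) > l2_to_similarity(2.0)  # ordering preserved
```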
+
+    def find_lookalikes_hybrid(
+        self,
+        seed_profile: ClientProfile,
+        candidates: List[ClientProfile],
+        k: int = 50,
+        planning_area_filter: Optional[str] = None,
+        housing_type_filter: Optional[List[HousingType]] = None,
+    ) -> List[ScoredClient]:
+        """
+        Find lookalikes using hybrid semantic-spatial matching.
+
+        This method is optimized for small datasets where pure vector
+        similarity may not work well due to sparse embeddings.
+
+        Args:
+            seed_profile: The "ideal donor" profile to match against
+            candidates: List of candidate client profiles
+            k: Number of neighbors to retrieve
+            planning_area_filter: Optional geo-fence filter
+            housing_type_filter: Optional housing type filter
+
+        Returns:
+            List of ScoredClient objects ranked by hybrid similarity
+        """
+        if not seed_profile.embedding:
+            # Generate a mock embedding based on profile text
+            seed_profile.embedding = self._generate_fallback_embedding(seed_profile)
+
+        # Compute hybrid embedding for seed
+        seed_emb = np.array(seed_profile.embedding)
+        seed_hybrid = self.hybrid_encoder.encode(seed_emb, seed_profile.coordinates)
+
+        scored_clients = []
+
+        for client in candidates:
+            # Apply filters
+            if planning_area_filter and client.planning_area != planning_area_filter:
+                continue
+
+            if housing_type_filter:
+                if client.housing_type not in housing_type_filter:
+                    continue
+
+            # Ensure client has an embedding
+            if not client.embedding:
+                client.embedding = self._generate_fallback_embedding(client)
+
+            # Compute hybrid embedding for candidate
+            cand_emb = np.array(client.embedding)
+            cand_hybrid = self.hybrid_encoder.encode(cand_emb, client.coordinates)
+
+            # Compute hybrid similarity
+            hybrid_sim = self.hybrid_encoder.compute_similarity(
+                seed_hybrid, cand_hybrid
+            )
+
+            # Calculate other scores
+            spatial_score = self.calculate_spatial_proxy_score(client)
+            geo_dist = self.haversine_distance(
+                seed_profile.coordinates, client.coordinates
+            )
+            proximity_score = max(0, 1 - (geo_dist / 40.0))
+
+            # Weighted final score
+            final_score = (
+                0.6 * hybrid_sim  # Higher weight on hybrid similarity
+                + 0.2 * spatial_score
+                + 0.2 * proximity_score
+            )
+
+            scored_clients.append(
+                ScoredClient(
+                    client=client,
+                    vector_similarity_score=hybrid_sim,
+                    spatial_proxy_score=spatial_score,
+                    proximity_score=proximity_score,
+                    final_score=final_score,
+                    vector_distance=1 - hybrid_sim,
+                    geo_distance_km=geo_dist,
+                )
+            )
+
+        # Sort by final score
+        scored_clients.sort(key=lambda x: x.final_score, reverse=True)
+        return scored_clients[:k]
+
+    def _generate_fallback_embedding(self, profile: ClientProfile) -> List[float]:
+        """
+        Generate a deterministic fallback embedding when the encoder is unavailable.
+
+        Uses a hash of profile features to create a pseudo-embedding.
+        This ensures consistent matching even without the actual encoder.
+        """
+        # Create a feature string
+        features = [
+            profile.planning_area,
+            profile.housing_type.value,
+            ",".join(sorted(profile.interests)),
+            ",".join(sorted(profile.causes)),
+            profile.preferred_language,
+            str(profile.is_donor),
+        ]
+        feature_str = "|".join(features)
+
+        # Use hash to generate pseudo-random but deterministic values
+        hash_bytes = hashlib.sha256(feature_str.encode()).digest()
+
+        # Expand hash to 1024 dimensions using multiple rounds
+        embedding = []
+        for i in range(64):  # 64 rounds of 16 values each = 1024
+            seed = int.from_bytes(hash_bytes, "big") + i
+            np.random.seed(seed % (2**32))
+            chunk = np.random.randn(16) * 0.1
+            embedding.extend(chunk.tolist())
+
+        # Normalize
+        emb_array = np.array(embedding[:1024])
+        norm = np.linalg.norm(emb_array)
+        if norm > 0:
+            emb_array = emb_array / norm
+
+        return emb_array.tolist()
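The fallback-embedding trick above (seed NumPy's RNG from a SHA-256 of the feature string) can be sketched more compactly with a local `Generator`, which avoids mutating global `np.random` state. This is a variant for illustration, not the file's exact implementation; the key property it demonstrates is determinism: identical features always yield the identical unit vector.

```python
import hashlib
import numpy as np

def fallback_embedding(feature_str: str, dims: int = 1024) -> np.ndarray:
    """Deterministic pseudo-embedding: same feature string -> same unit vector."""
    hash_bytes = hashlib.sha256(feature_str.encode()).digest()
    rng_seed = int.from_bytes(hash_bytes, "big") % (2**32)
    rng = np.random.default_rng(rng_seed)  # local generator, no global seeding
    vec = rng.standard_normal(dims)
    return vec / np.linalg.norm(vec)

a = fallback_embedding("orchard|condo|education")
b = fallback_embedding("orchard|condo|education")
c = fallback_embedding("bedok|hdb_4_room|health")
assert np.allclose(a, b)                      # deterministic
assert abs(np.linalg.norm(a) - 1.0) < 1e-9    # unit norm
assert not np.allclose(a, c)                  # different features diverge
```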
+
+    def _form_data_to_client_profile(
+        self, user_id: str, form_data: Dict[str, Any], form_type: str
+    ) -> ClientProfile:
+        """
+        Convert form data from the database to a ClientProfile.
+
+        Handles both donor forms (from /donors/register) and client forms
+        (from /clients/register), which have different field structures.
+
+        Donor forms have: name, donor_type, country, preferred_language, causes,
+        donation_frequency, amount_range, bio, motivation.
+        Client forms have: coordinates, planning_area, housing_type, interests,
+        causes, preferred_language, is_donor, etc.
+
+        For donors without GIS data, we infer reasonable defaults based on
+        available information.
+        """
+        import random
+
+        # Check if this is a donor form (different structure)
+        is_donor_form = form_type == "donor" or "donor_type" in form_data
+
+        if is_donor_form:
+            # Convert donor form data to a client profile,
+            # inferring GIS data from the available information.
+
+            # Get country and infer planning area
+            country = form_data.get("country", "SG")
+
+            # Assign a planning area (in production, could use IP geolocation)
+            if country == "SG":
+                planning_areas = list(PLANNING_AREAS.keys())
+                # Use hash of user_id for deterministic assignment
+                area_idx = hash(user_id) % len(planning_areas)
+                planning_area = planning_areas[area_idx]
+                area_info = PLANNING_AREAS[planning_area]
+                # Add small random offset for privacy
+                random.seed(hash(user_id))
+                lat = area_info["lat"] + random.uniform(-0.003, 0.003)
+                lng = area_info["lng"] + random.uniform(-0.003, 0.003)
+                coordinates = (round(lat, 4), round(lng, 4))
+            else:
+                # Non-SG donors - use central SG as a placeholder
+                planning_area = "central"
+                coordinates = (1.2897, 103.8501)
+
+            # Infer housing type from amount_range (income proxy)
+            amount_range = form_data.get("amount_range", "")
+            if "5000" in amount_range or "10000" in amount_range:
+                housing_type = HousingType.LANDED
+            elif "2000" in amount_range or "3000" in amount_range:
+                housing_type = HousingType.CONDO
+            elif "1000" in amount_range:
+                housing_type = HousingType.HDB_EXECUTIVE
+            elif "500" in amount_range:
+                housing_type = HousingType.HDB_5_ROOM
+            elif "100" in amount_range or "200" in amount_range:
+                housing_type = HousingType.HDB_4_ROOM
+            else:
+                # Default based on donor_type
+                donor_type = form_data.get("donor_type", "individual")
+                if donor_type == "corporate":
+                    housing_type = HousingType.CONDO  # Proxy for corporate
+                elif donor_type == "foundation":
+                    housing_type = HousingType.LANDED  # High value
+                else:
+                    housing_type = HousingType.HDB_4_ROOM
+
+            # Get causes and infer interests from bio/motivation
+            causes = form_data.get("causes", [])
+
+            # Extract interests from bio and motivation text
+            bio = form_data.get("bio", "")
+            motivation = form_data.get("motivation", "")
+            combined_text = f"{bio} {motivation}".lower()
+
+            interest_keywords = {
+                "technology": ["tech", "software", "digital", "innovation", "startup"],
+                "sustainability": ["green", "sustainable", "climate", "environment", "eco"],
+                "finance": ["finance", "banking", "investment", "money", "economic"],
+                "healthcare": ["health", "medical", "hospital", "wellness", "care"],
+                "education": ["education", "school", "learning", "teach", "university"],
+                "community": ["community", "local", "neighborhood", "social", "volunteer"],
+                "arts": ["art", "culture", "music", "creative", "design"],
+            }
+
+            interests = []
+            for interest, keywords in interest_keywords.items():
+                if any(kw in combined_text for kw in keywords):
+                    interests.append(interest)
+
+            # Add causes as interests too (overlap is fine)
+            for cause in causes:
+                if cause not in interests:
+                    interests.append(cause)
+
+            return ClientProfile(
+                user_id=user_id,
+                coordinates=coordinates,
+                planning_area=planning_area,
+                housing_type=housing_type,
+                interests=interests[:5],  # Limit to 5
+                causes=causes,
+                preferred_language=form_data.get("preferred_language", "en"),
+                is_donor=True,  # Came from donor registration
+                total_donated=0,  # Unknown for new donors
+                donation_count=0,
+                age_range=None,
+            )
+        else:
+            # Client form - has GIS data directly
+            return ClientProfile(
+                user_id=user_id,
+                coordinates=tuple(form_data.get("coordinates", [1.3521, 103.8198])),
+                planning_area=form_data.get("planning_area", "central"),
+                housing_type=HousingType(form_data.get("housing_type", "hdb_4_room")),
+                interests=form_data.get("interests", []),
+                causes=form_data.get("causes", []),
+                preferred_language=form_data.get("preferred_language", "en"),
+                is_donor=form_data.get("is_donor", False),
+                total_donated=form_data.get("total_donated", 0),
+                donation_count=form_data.get("donation_count", 0),
+                age_range=form_data.get("age_range"),
+            )
+
+    async def find_lookalikes(
+        self,
+        seed_profile: ClientProfile,
+        k: int = 50,
+        planning_area_filter: Optional[str] = None,
+        housing_type_filter: Optional[List[HousingType]] = None,
+        use_hybrid: bool = False,
+        fallback_candidates: Optional[List[ClientProfile]] = None,
+    ) -> List[ScoredClient]:
+        """
+        Find top-K lookalikes for a seed donor profile.
+
+        Args:
+            seed_profile: The "ideal donor" profile to match against
+            k: Number of neighbors to retrieve
+            planning_area_filter: Optional geo-fence filter
+            housing_type_filter: Optional housing type filter
+            use_hybrid: Force hybrid matching (good for small datasets)
+            fallback_candidates: Candidates to use if the vector store returns nothing
+
+        Returns:
+            List of ScoredClient objects ranked by similarity
+        """
+        # Check if we should use hybrid matching
+        if use_hybrid and fallback_candidates:
+            return self.find_lookalikes_hybrid(
+                seed_profile=seed_profile,
+                candidates=fallback_candidates,
+                k=k,
+                planning_area_filter=planning_area_filter,
+                housing_type_filter=housing_type_filter,
+            )
+
+        if not self.encoder or not self.vector_store:
+            # No encoder/store - use hybrid with fallback candidates
+            if fallback_candidates:
+                return self.find_lookalikes_hybrid(
+                    seed_profile=seed_profile,
+                    candidates=fallback_candidates,
+                    k=k,
+                    planning_area_filter=planning_area_filter,
+                    housing_type_filter=housing_type_filter,
+                )
+            raise ValueError(
+                "Encoder and vector store must be initialized, or provide fallback_candidates"
+            )
+
+        # Generate embedding for the seed profile
+        seed_text = seed_profile.to_embedding_text()
+        seed_embedding = await self.encoder.encode(seed_text)
+
+        # Query vector store - search for BOTH donors and clients.
+        # Donors registered via /donors/register have form_type="donor";
+        # clients registered via /clients/register have form_type="client".
+        all_results = []
+
+        # Search for donors first (main source of potential clients for donees)
+        donor_results = await self.vector_store.find_similar(
+            query_embedding=seed_embedding,
+            form_type="donor",
+            limit=k * 2,
+            country_filter="SG",
+        )
+        all_results.extend(donor_results)
+
+        # Also search for clients (if any registered via the client endpoint)
+        client_results = await self.vector_store.find_similar(
+            query_embedding=seed_embedding,
+            form_type="client",
+            limit=k * 2,
+            country_filter="SG",
+        )
+        all_results.extend(client_results)
+
+        # Deduplicate by ID and sort by distance
+        seen_ids = set()
+        results = []
+        for r in sorted(all_results, key=lambda x: x.distance):
+            if r.id not in seen_ids:
+                seen_ids.add(r.id)
+                results.append(r)
+
+        scored_clients = []
+        for result in results:
+            # Reconstruct client profile from form_data
+            form_data = result.form_data
+
+            # Apply planning area filter
+            if planning_area_filter:
+                if form_data.get("planning_area") != planning_area_filter:
+                    continue
+
+            # Apply housing type filter
+            if housing_type_filter:
+                client_housing = form_data.get("housing_type")
+                if client_housing not in [h.value for h in housing_type_filter]:
+                    continue
+
+            # Create client profile from form_data.
+            # Handles both donor forms (different fields) and client forms.
+            client = self._form_data_to_client_profile(
+                result.id, form_data, result.form_type
+            )
+
+            # Calculate scores
+            vector_score = self.calculate_vector_similarity(result.distance)
+            spatial_score = self.calculate_spatial_proxy_score(client)
+            proximity_score = 0.5  # Default, can be enhanced with event data
+
+            # Calculate final weighted score
+            final_score = (
+                self.weights["vector_similarity"] * vector_score
+                + self.weights["spatial_proxy"] * spatial_score
+                + self.weights["proximity"] * proximity_score
+            )
+
+            scored_clients.append(
+                ScoredClient(
+                    client=client,
+                    vector_similarity_score=vector_score,
+                    spatial_proxy_score=spatial_score,
+                    proximity_score=proximity_score,
+                    final_score=final_score,
+                    vector_distance=result.distance,
+                )
+            )
+
+        # Sort by final score and return top K
+        scored_clients.sort(key=lambda x: x.final_score, reverse=True)
+        return scored_clients[:k]
+
+    def apply_tiered_targeting(
+        self, clients: List[ScoredClient], min_score: float = 0.0, tiers: int = 3
+    ) -> Dict[str, List[ScoredClient]]:
+        """
+        Apply tiered targeting to segment clients.
+
+        Returns clients grouped into tiers:
+        - Tier 1: High priority (top third)
+        - Tier 2: Medium priority (middle third)
+        - Tier 3: Lower priority (bottom third)
+        """
+        # Filter by minimum score
+        filtered = [c for c in clients if c.final_score >= min_score]
+
+        if not filtered:
+            return {"tier_1": [], "tier_2": [], "tier_3": []}
+
+        # Calculate tier boundaries
+        n = len(filtered)
+        tier_size = n // tiers
+
+        return {
+            "tier_1": filtered[:tier_size],
+            "tier_2": filtered[tier_size : tier_size * 2],
+            "tier_3": filtered[tier_size * 2 :],
+        }
+
+    def to_geojson(self, scored_clients: List[ScoredClient]) -> Dict[str, Any]:
+        """
+        Convert scored clients to GeoJSON for map visualization.
+
+        Note: Coordinates are reduced in precision for privacy.
+        """
+        features = []
+
+        for sc in scored_clients:
+            # Reduce coordinate precision for privacy
+            lat, lng = self.reduce_coordinate_precision(
+                sc.client.coordinates[0], sc.client.coordinates[1]
+            )
+
+            feature = {
+                "type": "Feature",
+                "geometry": {
+                    "type": "Point",
+                    "coordinates": [lng, lat],  # GeoJSON order is [lng, lat]
+                },
+                "properties": {
+                    "user_id": sc.client.user_id,
+                    "planning_area": sc.client.planning_area,
+                    "housing_type": sc.client.housing_type.value,
+                    "causes": sc.client.causes,
+                    "is_donor": sc.client.is_donor,
+                    "final_score": round(sc.final_score, 3),
+                    "vector_similarity": round(sc.vector_similarity_score, 3),
+                    "spatial_proxy": round(sc.spatial_proxy_score, 3),
+                    "proximity": round(sc.proximity_score, 3),
+                    # Exclude PII such as name and exact address
+                },
+            }
+            features.append(feature)
+
+        return {"type": "FeatureCollection", "features": features}
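A common GeoJSON pitfall the method above side-steps: positions are `[longitude, latitude]`, the reverse of the `(lat, lng)` tuples used elsewhere in this module. A minimal builder showing the swap plus the same 3-decimal privacy rounding (~100 m):

```python
def point_feature(lat, lng, props, decimals=3):
    """Build a GeoJSON Point Feature, rounding coordinates and swapping to [lng, lat]."""
    return {
        "type": "Feature",
        "geometry": {
            "type": "Point",
            "coordinates": [round(lng, decimals), round(lat, decimals)],
        },
        "properties": props,
    }

f = point_feature(1.30484, 103.83185, {"final_score": 0.82})
assert f["geometry"]["coordinates"] == [103.832, 1.305]  # lng first, then lat
```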
+
+
+# ============================================================================
+# Mock Data Generator (for demonstration)
+# ============================================================================
+
+# Singapore-style names (multi-ethnic: Chinese, Malay, Indian, Eurasian)
+_FIRST_NAMES_CHINESE = [
+    "Wei Ling", "Jia Hui", "Xiu Mei", "Zhi Wei", "Mei Ling", "Jun Jie",
+    "Xiao Ming", "Yu Yan", "Jing Yi", "Zhi Hao", "Hui Min", "Kai Wen",
+    "Shi Min", "Yi Xuan", "Jia Ying", "Wen Hui", "Li Hua", "Xin Yi",
+    "Jia Min", "Zhi Xuan", "Shu Ting", "Wei Jie", "Pei Shan", "Jun Wei",
+]
+_SURNAMES_CHINESE = [
+    "Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Goh", "Chua", "Chan", "Koh",
+    "Teo", "Ang", "Yeo", "Tay", "Ho", "Low", "Sim", "Chong", "Leong", "Foo",
+]
+
+_FIRST_NAMES_MALAY = [
+    "Ahmad", "Muhammad", "Fatimah", "Siti", "Nur", "Aisyah", "Hafiz",
+    "Amirah", "Farah", "Haziq", "Iman", "Zulkifli", "Rashid", "Nurul",
+    "Hakim", "Syahira", "Irfan", "Liyana", "Danial", "Ain",
+]
+_SURNAMES_MALAY = [
+    "bin Abdullah", "binti Ismail", "bin Rahman", "binti Hassan",
+    "bin Osman", "binti Ahmad", "bin Yusof", "binti Mohamed",
+    "bin Ibrahim", "binti Ali", "bin Hamid", "binti Zainal",
+]
+
+_FIRST_NAMES_INDIAN = [
+    "Priya", "Raj", "Ananya", "Arjun", "Kavitha", "Suresh", "Deepa",
+    "Vijay", "Lakshmi", "Rahul", "Nirmala", "Sanjay", "Meena", "Arun",
+    "Revathi", "Ganesh", "Shanti", "Kumar", "Devi", "Ravi",
+]
+_SURNAMES_INDIAN = [
+    "Krishnan", "Pillai", "Nair", "Menon", "Rajan", "Sharma", "Patel",
+    "Subramaniam", "Narayanan", "Chandran", "Gopal", "Muthu", "Samy",
+]
+
+_FIRST_NAMES_EURASIAN = [
+    "Daniel", "Sarah", "Michael", "Rachel", "David", "Michelle", "James",
+    "Vanessa", "Mark", "Stephanie", "Paul", "Amanda", "Brian", "Nicole",
+]
+_SURNAMES_EURASIAN = [
+    "De Souza", "Pereira", "Rodrigues", "Fernandes", "Da Costa",
+    "Oliveira", "Sequeira", "D'Cruz", "Shepherdson", "Westerhout",
+]
+
+
+def generate_singapore_name() -> str:
+    """Generate a random Singapore-style name reflecting local demographics."""
+    import random
+
+    ethnicity = random.choices(
+        ["chinese", "malay", "indian", "eurasian"],
+        weights=[0.74, 0.13, 0.09, 0.04],  # Approximate Singapore demographics
+    )[0]
+
+    if ethnicity == "chinese":
+        return f"{random.choice(_SURNAMES_CHINESE)} {random.choice(_FIRST_NAMES_CHINESE)}"
+    elif ethnicity == "malay":
+        first = random.choice(_FIRST_NAMES_MALAY)
+        surname = random.choice(_SURNAMES_MALAY)
+        return f"{first} {surname}"
+    elif ethnicity == "indian":
+        return f"{random.choice(_FIRST_NAMES_INDIAN)} {random.choice(_SURNAMES_INDIAN)}"
+    else:
+        return f"{random.choice(_FIRST_NAMES_EURASIAN)} {random.choice(_SURNAMES_EURASIAN)}"
+
+
+def generate_mock_clients(n: int = 100) -> List[ClientProfile]:
+    """Generate mock client profiles for testing."""
+    import random
+
+    used_names: set[str] = set()
+
+    def get_unique_name() -> str:
+        """Generate a unique Singapore name, adding a suffix if needed."""
+        base_name = generate_singapore_name()
+        name = base_name
+        suffix = 1
+        while name in used_names:
+            suffix += 1
+            name = f"{base_name} ({suffix})"
+        used_names.add(name)
+        return name
+
+    interests_pool = [
+        "technology", "sustainability", "finance", "healthcare", "education",
+        "arts", "sports", "community", "environment", "innovation",
+        "social_impact", "volunteering", "entrepreneurship", "wellness",
+    ]
+
+    causes_pool = [
+        "education", "health", "environment", "poverty", "children",
+        "elderly", "disability", "animals", "arts", "disaster_relief",
+        "human_rights", "technology", "housing",
+    ]
+
+    languages = ["en", "zh", "ms", "ta", "th", "vi"]
+    age_ranges = ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
+    housing_types = list(HousingType)
+    planning_areas = list(PLANNING_AREAS.keys())
+
+    # Reuse one recommender for fallback embeddings (no need to rebuild per client)
+    recommender = GISRecommender()
+
+    clients = []
+
+    for i in range(n):
+        # Select a random planning area and add some noise to coordinates
+        area_key = random.choice(planning_areas)
+        area = PLANNING_AREAS[area_key]
+
+        # Add small random offset (within ~500m)
+        lat = area["lat"] + random.uniform(-0.005, 0.005)
+        lng = area["lng"] + random.uniform(-0.005, 0.005)
+
+        # Weighted housing type selection (more HDB in Singapore)
+        housing_weights = [0.05, 0.15, 0.25, 0.2, 0.1, 0.15, 0.08, 0.02]
+        housing = random.choices(housing_types, weights=housing_weights)[0]
+
+        # Random interests and causes
+        interests = random.sample(interests_pool, random.randint(2, 5))
+        causes = random.sample(causes_pool, random.randint(1, 4))
+
+        # Donor status (30% are donors)
+        is_donor = random.random() < 0.3
+
+        client = ClientProfile(
+            user_id=get_unique_name(),
+            coordinates=(round(lat, 4), round(lng, 4)),
+            planning_area=area_key,
+            housing_type=housing,
+            interests=interests,
+            causes=causes,
+            preferred_language=random.choice(languages),
+            is_donor=is_donor,
+            total_donated=random.uniform(50, 5000) if is_donor else 0,
+            donation_count=random.randint(1, 20) if is_donor else 0,
+            age_range=random.choice(age_ranges),
+        )
+
+        # Generate fallback embedding and compute reduced versions
+        client.embedding = recommender._generate_fallback_embedding(client)
+        client.compute_reduced_embeddings()
+
+        clients.append(client)
+
+    return clients
+
+
+def generate_seed_donor_profile(cause: str = "education") -> ClientProfile:
+    """Generate an ideal donor profile for lookalike search."""
+    profile = ClientProfile(
+        user_id="seed_donor",
+        coordinates=(1.3048, 103.8318),  # Orchard area
+        planning_area="orchard",
+        housing_type=HousingType.CONDO,
+        interests=["sustainability", "social_impact", "community"],
+        causes=[cause, "children"],
+        preferred_language="en",
+        is_donor=True,
+        total_donated=2500.0,
+        donation_count=12,
+        age_range="35-44",
+    )
+
+    # Generate fallback embedding and compute reduced versions
+    recommender = GISRecommender()
+    profile.embedding = recommender._generate_fallback_embedding(profile)
+    profile.compute_reduced_embeddings()
+
+    return profile
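The `get_unique_name` helper in `generate_mock_clients` can be exercised on its own. In this sketch the name generator is a stand-in that always returns the same string, which forces the suffixing path:

```python
def unique_name(generate, used):
    """Append ' (2)', ' (3)', ... until the generated name is unused."""
    base = generate()
    name, suffix = base, 1
    while name in used:
        suffix += 1
        name = f"{base} ({suffix})"
    used.add(name)
    return name

used = set()
gen = lambda: "Tan Wei Ling"  # stand-in for generate_singapore_name()
assert unique_name(gen, used) == "Tan Wei Ling"
assert unique_name(gen, used) == "Tan Wei Ling (2)"
assert unique_name(gen, used) == "Tan Wei Ling (3)"
```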
recommender/vector_store.py ADDED
@@ -0,0 +1,404 @@
+"""Vector storage and retrieval for donor/volunteer embeddings.
+
+Uses the existing my_embeddings table in Supabase with the pgvector extension.
+"""
+
+import json
+from typing import List, Optional, Dict, Any, Union
+from dataclasses import dataclass
+import numpy as np
+
+
+def _parse_json_field(value: Union[str, dict, None]) -> dict:
+    """Safely parse a JSON field that might already be a dict (psycopg3 auto-parses)."""
+    if value is None:
+        return {}
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, str):
+        try:
+            return json.loads(value)
+        except json.JSONDecodeError:
+            return {}
+    return {}
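`_parse_json_field` exists because, as its docstring notes, psycopg3 may hand back JSON columns either as raw text or as already-parsed dicts. Its contract, restated as a standalone function with checks covering all four input shapes:

```python
import json

def parse_json_field(value):
    """Return a dict whether the driver gives back raw JSON text or a parsed dict."""
    if value is None:
        return {}
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return {}
    return {}

assert parse_json_field(None) == {}
assert parse_json_field('{"form_type": "donor"}') == {"form_type": "donor"}
assert parse_json_field({"form_type": "donor"}) == {"form_type": "donor"}
assert parse_json_field("not json") == {}  # malformed text degrades to empty dict
```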
+
+
+@dataclass
+class SimilarityResult:
+    """Result from a similarity search.
+
+    Attributes:
+        id: The source_id of the matched form.
+        form_data: The original form data as a dictionary.
+        score: Similarity score (higher is more similar).
+        form_type: Type of form ("donor" or "volunteer").
+        distance: Raw L2 distance from the query.
+    """
+
+    id: str
+    form_data: Dict[str, Any]
+    score: float
+    form_type: str
+    distance: float = 0.0
+
+
+class DonorVectorStore:
+    """Vector storage and retrieval for donor/volunteer embeddings.
+
+    Uses the existing my_embeddings table schema:
+        - source_id: form ID
+        - chunk_index: always 0 (single embedding per form)
+        - text_content: JSON-serialized form data
+        - metadata: {"form_type": "donor"|"volunteer", ...}
+        - embedding: VECTOR(1024)
+
+    Attributes:
+        pool: AsyncConnectionPool for database connections.
+    """
+
+    def __init__(self, pool):
+        """Initialize the vector store.
+
+        Args:
+            pool: AsyncConnectionPool from psycopg_pool
+        """
+        self.pool = pool
+
+    async def store_embedding(
+        self,
+        form_id: str,
+        form_type: str,
+        embedding: np.ndarray,
+        form_data: Dict[str, Any],
+    ) -> int:
+        """Store a form embedding in the my_embeddings table.
+
+        Args:
+            form_id: Unique identifier for the form.
+            form_type: Type of form ("donor" or "volunteer").
+            embedding: The 1024-dimensional embedding vector.
+            form_data: Original form data to store.
+
+        Returns:
+            The database ID of the inserted record.
+        """
+        embedding_list = embedding.tolist()
+        form_json = json.dumps(form_data, default=str)
+
+        async with self.pool.connection() as conn:
+            async with conn.cursor() as cur:
+                await cur.execute(
+                    """
+                    INSERT INTO my_embeddings
+                        (source_id, chunk_index, text_content, metadata, embedding)
+                    VALUES (%s, %s, %s, %s, %s::vector)
+                    RETURNING id
+                    """,
+                    (
+                        form_id,
+                        0,  # Single embedding per form
+                        form_json,
+                        json.dumps({"form_type": form_type}),
+                        embedding_list,
+                    ),
+                )
+                result = await cur.fetchone()
+                return result[0]
+
+    async def update_embedding(
+        self,
+        form_id: str,
+        embedding: np.ndarray,
+        form_data: Optional[Dict[str, Any]] = None,
+    ) -> bool:
+        """Update an existing embedding.
+
+        Args:
+            form_id: The form ID to update.
+            embedding: New embedding vector.
+            form_data: Optional updated form data.
+
+        Returns:
+            True if the update succeeded, False if the record was not found.
+        """
+        embedding_list = embedding.tolist()
+
+        async with self.pool.connection() as conn:
+            async with conn.cursor() as cur:
+                if form_data:
+                    form_json = json.dumps(form_data, default=str)
+                    await cur.execute(
+                        """
+                        UPDATE my_embeddings
+                        SET embedding = %s::vector, text_content = %s
+                        WHERE source_id = %s
+                        """,
+                        (embedding_list, form_json, form_id),
+                    )
+                else:
+                    await cur.execute(
+                        """
+                        UPDATE my_embeddings
+                        SET embedding = %s::vector
+                        WHERE source_id = %s
+                        """,
+                        (embedding_list, form_id),
+                    )
+                return cur.rowcount > 0
+
+    async def delete_embedding(self, form_id: str) -> bool:
+        """Delete an embedding by form ID.
+
+        Args:
+            form_id: The form ID to delete.
+
+        Returns:
+            True if the deletion succeeded, False if the record was not found.
+        """
+        async with self.pool.connection() as conn:
+            async with conn.cursor() as cur:
+                await cur.execute(
+                    "DELETE FROM my_embeddings WHERE source_id = %s",
+                    (form_id,),
+                )
+                return cur.rowcount > 0
+
+    async def get_embedding(self, form_id: str) -> Optional[SimilarityResult]:
+        """Get a specific embedding by form ID.
+
+        Args:
+            form_id: The form ID to retrieve.
+
+        Returns:
+            SimilarityResult if found, None otherwise.
+        """
+        async with self.pool.connection() as conn:
175
+ async with conn.cursor() as cur:
176
+ await cur.execute(
177
+ """
178
+ SELECT source_id, text_content, metadata
179
+ FROM my_embeddings
180
+ WHERE source_id = %s
181
+ """,
182
+ (form_id,)
183
+ )
184
+ row = await cur.fetchone()
185
+
186
+ if not row:
187
+ return None
188
+
189
+ form_data = _parse_json_field(row[1])
190
+ metadata = _parse_json_field(row[2])
191
+
192
+ return SimilarityResult(
193
+ id=row[0],
194
+ form_data=form_data,
195
+ form_type=metadata.get("form_type", "unknown"),
196
+ score=1.0,
197
+ distance=0.0,
198
+ )
199
+
200
+ async def find_similar(
201
+ self,
202
+ query_embedding: np.ndarray,
203
+ form_type: Optional[str] = None,
204
+ limit: int = 10,
205
+ country_filter: Optional[str] = None,
206
+ exclude_ids: Optional[List[str]] = None
207
+ ) -> List[SimilarityResult]:
208
+ """Find similar donors/volunteers using vector similarity.
209
+
210
+ Uses L2 distance (Euclidean) with IVFFlat index for efficient search.
211
+
212
+ Args:
213
+ query_embedding: The query embedding vector.
214
+ form_type: Optional filter for "donor" or "volunteer".
215
+ limit: Maximum number of results to return.
216
+ country_filter: Optional filter for country code.
217
+ exclude_ids: Optional list of form IDs to exclude.
218
+
219
+ Returns:
220
+ List of SimilarityResult ordered by similarity (highest first).
221
+ """
222
+ embedding_list = query_embedding.tolist()
223
+
224
+ # Build query with optional filters
225
+ query = """
226
+ SELECT
227
+ source_id,
228
+ text_content,
229
+ metadata,
230
+ embedding <-> %s::vector AS distance
231
+ FROM my_embeddings
232
+ WHERE 1=1
233
+ """
234
+ params: List[Any] = [embedding_list]
235
+
236
+ if form_type:
237
+ query += " AND metadata->>'form_type' = %s"
238
+ params.append(form_type)
239
+
240
+ if country_filter:
241
+ query += " AND text_content ILIKE %s"
242
+ params.append(f'%"country": "{country_filter}"%')
243
+
244
+ if exclude_ids:
245
+ placeholders = ", ".join(["%s"] * len(exclude_ids))
246
+ query += f" AND source_id NOT IN ({placeholders})"
247
+ params.extend(exclude_ids)
248
+
249
+ query += " ORDER BY distance ASC LIMIT %s"
250
+ params.append(limit)
251
+
252
+ async with self.pool.connection() as conn:
253
+ async with conn.cursor() as cur:
254
+ await cur.execute(query, params)
255
+ rows = await cur.fetchall()
256
+
257
+ results = []
258
+ for row in rows:
259
+ form_data = _parse_json_field(row[1])
260
+ metadata = _parse_json_field(row[2])
261
+ distance = float(row[3])
262
+
263
+ results.append(SimilarityResult(
264
+ id=row[0],
265
+ form_data=form_data,
266
+ form_type=metadata.get("form_type", "unknown"),
267
+ score=1.0 / (1.0 + distance), # Convert distance to similarity
268
+ distance=distance
269
+ ))
270
+
271
+ return results
272
+
273
+ async def find_by_causes(
274
+ self,
275
+ target_causes: List[str],
276
+ query_embedding: np.ndarray,
277
+ limit: int = 20
278
+ ) -> List[SimilarityResult]:
279
+ """Hybrid search: filter by causes, rank by embedding similarity.
280
+
281
+ Combines keyword filtering with vector similarity for better
282
+ recommendations when specific causes are targeted.
283
+
284
+ Args:
285
+ target_causes: List of cause categories to match.
286
+ query_embedding: The query embedding for ranking.
287
+ limit: Maximum number of results to return.
288
+
289
+ Returns:
290
+ List of SimilarityResult matching causes, ranked by similarity.
291
+ """
292
+ embedding_list = query_embedding.tolist()
293
+
294
+ # Build ILIKE clauses for cause filtering
295
+ cause_conditions = " OR ".join([
296
+ "text_content ILIKE %s" for _ in target_causes
297
+ ])
298
+ cause_params = [f"%{cause}%" for cause in target_causes]
299
+
300
+ query = f"""
301
+ SELECT
302
+ source_id,
303
+ text_content,
304
+ metadata,
305
+ embedding <-> %s::vector AS distance
306
+ FROM my_embeddings
307
+ WHERE ({cause_conditions})
308
+ ORDER BY distance ASC
309
+ LIMIT %s
310
+ """
311
+
312
+ params = [embedding_list] + cause_params + [limit]
313
+
314
+ async with self.pool.connection() as conn:
315
+ async with conn.cursor() as cur:
316
+ await cur.execute(query, params)
317
+ rows = await cur.fetchall()
318
+
319
+ results = []
320
+ for row in rows:
321
+ form_data = _parse_json_field(row[1])
322
+ metadata = _parse_json_field(row[2])
323
+ distance = float(row[3])
324
+
325
+ results.append(SimilarityResult(
326
+ id=row[0],
327
+ form_data=form_data,
328
+ form_type=metadata.get("form_type", "unknown"),
329
+ score=1.0 / (1.0 + distance),
330
+ distance=distance
331
+ ))
332
+
333
+ return results
334
+
335
+ async def count_by_type(self) -> Dict[str, int]:
336
+ """Get count of embeddings by form type.
337
+
338
+ Returns:
339
+ Dictionary with counts: {"donor": N, "volunteer": M, "total": N+M}
340
+ """
341
+ async with self.pool.connection() as conn:
342
+ async with conn.cursor() as cur:
343
+ await cur.execute("""
344
+ SELECT
345
+ metadata->>'form_type' as form_type,
346
+ COUNT(*) as count
347
+ FROM my_embeddings
348
+ GROUP BY metadata->>'form_type'
349
+ """)
350
+ rows = await cur.fetchall()
351
+
352
+ counts = {"donor": 0, "volunteer": 0, "total": 0}
353
+ for row in rows:
354
+ form_type = row[0] or "unknown"
355
+ count = row[1]
356
+ if form_type in counts:
357
+ counts[form_type] = count
358
+ counts["total"] += count
359
+
360
+ return counts
361
+
362
+ async def find_by_form_type(
363
+ self, form_type: str, limit: int = 500
364
+ ) -> List[SimilarityResult]:
365
+ """Get all entries of a specific form type.
366
+
367
+ Args:
368
+ form_type: Type of form ("donor", "volunteer", or "client").
369
+ limit: Maximum number of results to return.
370
+
371
+ Returns:
372
+ List of SimilarityResult for the specified form type.
373
+ """
374
+ query = """
375
+ SELECT
376
+ source_id,
377
+ text_content,
378
+ metadata
379
+ FROM my_embeddings
380
+ WHERE metadata->>'form_type' = %s
381
+ LIMIT %s
382
+ """
383
+
384
+ async with self.pool.connection() as conn:
385
+ async with conn.cursor() as cur:
386
+ await cur.execute(query, (form_type, limit))
387
+ rows = await cur.fetchall()
388
+
389
+ results = []
390
+ for row in rows:
391
+ form_data = _parse_json_field(row[1])
392
+ metadata = _parse_json_field(row[2])
393
+
394
+ results.append(
395
+ SimilarityResult(
396
+ id=row[0],
397
+ form_data=form_data,
398
+ form_type=metadata.get("form_type", form_type),
399
+ score=1.0,
400
+ distance=0.0,
401
+ )
402
+ )
403
+
404
+ return results
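Both `find_similar` and `find_by_causes` above convert the raw pgvector L2 distance into a bounded similarity via `1 / (1 + d)`. A minimal standalone sketch of that mapping (the function name here is illustrative, not part of the module):

```python
def distance_to_score(distance: float) -> float:
    """Map an L2 distance in [0, inf) to a similarity in (0, 1]."""
    return 1.0 / (1.0 + distance)

# Distance 0 (identical vectors) gives the maximum score of 1.0;
# larger distances decay toward 0 but never reach it.
print(distance_to_score(0.0))  # 1.0
print(distance_to_score(1.0))  # 0.5
print(distance_to_score(3.0))  # 0.25
```

Because the mapping is strictly decreasing in distance, ordering by `distance ASC` in SQL is equivalent to ordering by score descending.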
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ fastapi>=0.109.0
+ uvicorn>=0.27.0
+ pydantic>=2.0.0
+ python-dotenv
+
+ # LangGraph and LangChain
+ langchain
+ langchain-core
+ langchain-ollama
+ langgraph
+ langgraph-checkpoint-postgres
+
+ # Database
+ psycopg[binary,pool]>=3.1.0
+
+ # SeaLion encoder
+ httpx>=0.24.0
+ numpy>=1.24.0
+
+ # OpenAI (for charity web search tools)
+ openai>=1.0.0
test_agentic_rag.py ADDED
@@ -0,0 +1,222 @@
+ """Test script for Agentic RAG functionality."""
+
+ import asyncio
+ import os
+ import sys
+
+ # Add project root to path
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+ # Load environment variables
+ try:
+     from dotenv import load_dotenv
+     load_dotenv()
+ except ImportError:
+     pass
+
+
+ async def test_rag_tools():
+     """Test individual RAG tools."""
+     print("\n" + "=" * 60)
+     print("Testing Agentic RAG Tools")
+     print("=" * 60)
+
+     from tools.rag_tools import (
+         RAG_TOOLS,
+         set_rag_dependencies,
+         list_available_categories,
+         get_statistics,
+         semantic_search,
+     )
+
+     # Check available tools
+     print("\n📦 Available RAG Tools:")
+     for tool in RAG_TOOLS:
+         print(f"  - {tool.name}: {tool.description[:60]}...")
+
+     # Initialize dependencies
+     print("\n🔧 Initializing dependencies...")
+
+     try:
+         from encoders.sealion import SeaLionEncoder
+         from recommender.vector_store import DonorVectorStore
+         from psycopg_pool import AsyncConnectionPool
+
+         # Check for required env vars
+         sealion_endpoint = os.getenv("SEALION_ENDPOINT")
+         db_host = os.getenv("SUPABASE_DB_HOST")
+
+         if not sealion_endpoint:
+             print("  ⚠️ SEALION_ENDPOINT not set, skipping live tests")
+             return
+
+         if not db_host:
+             print("  ⚠️ Database credentials not set, skipping live tests")
+             return
+
+         # Initialize encoder
+         encoder = SeaLionEncoder(endpoint_url=sealion_endpoint)
+         print(f"  ✅ SeaLion encoder initialized (dim: {encoder.embedding_dimension})")
+
+         # Initialize database pool
+         db_port = os.getenv("SUPABASE_DB_PORT", "6543")
+         db_name = os.getenv("SUPABASE_DB_NAME", "postgres")
+         db_user = os.getenv("SUPABASE_DB_USER")
+         db_password = os.getenv("SUPABASE_DB_PASSWORD")
+         db_sslmode = os.getenv("SUPABASE_DB_SSLMODE", "require")
+
+         conn_string = (
+             f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
+             f"?sslmode={db_sslmode}"
+         )
+
+         pool = AsyncConnectionPool(
+             conninfo=conn_string,
+             max_size=5,
+             kwargs={"autocommit": True, "prepare_threshold": None},
+         )
+         await pool.open()
+         print("  ✅ Database pool connected")
+
+         vector_store = DonorVectorStore(pool)
+         print("  ✅ Vector store initialized")
+
+         # Set dependencies for tools
+         set_rag_dependencies(encoder, vector_store)
+         print("  ✅ RAG tools configured")
+
+         # Test list_available_categories
+         print("\n📊 Testing list_available_categories...")
+         categories_result = await list_available_categories.ainvoke({})
+         print(f"  Result: {categories_result[:200]}...")
+
+         # Test get_statistics
+         print("\n📈 Testing get_statistics...")
+         stats_result = await get_statistics.ainvoke({})
+         print(f"  Result: {stats_result}")
+
+         # Test semantic_search (if there's data)
+         print("\n🔍 Testing semantic_search...")
+         search_result = await semantic_search.ainvoke({
+             "query": "education donors in Singapore",
+             "limit": 3
+         })
+         print(f"  Result: {search_result[:300]}...")
+
+         # Cleanup
+         await pool.close()
+         print("\n✅ All tool tests completed!")
+
+     except Exception as e:
+         import traceback
+         print(f"  ❌ Error: {e}")
+         traceback.print_exc()
+
+
+ async def test_agentic_rag_agent():
+     """Test the full Agentic RAG agent."""
+     print("\n" + "=" * 60)
+     print("Testing Agentic RAG Agent")
+     print("=" * 60)
+
+     try:
+         from agents.agentic_rag import AgenticRAGAgent
+         from encoders.sealion import SeaLionEncoder
+         from recommender.vector_store import DonorVectorStore
+         from psycopg_pool import AsyncConnectionPool
+         from langchain_ollama import ChatOllama
+
+         # Check for required env vars
+         sealion_endpoint = os.getenv("SEALION_ENDPOINT")
+         db_host = os.getenv("SUPABASE_DB_HOST")
+         ollama_api_key = os.getenv("OLLAMA_API_KEY")
+
+         if not all([sealion_endpoint, db_host]):
+             print("  ⚠️ Required environment variables not set, skipping agent test")
+             return
+
+         print("\n🔧 Initializing agent components...")
+
+         # Initialize encoder
+         encoder = SeaLionEncoder(endpoint_url=sealion_endpoint)
+
+         # Initialize database
+         db_port = os.getenv("SUPABASE_DB_PORT", "6543")
+         db_name = os.getenv("SUPABASE_DB_NAME", "postgres")
+         db_user = os.getenv("SUPABASE_DB_USER")
+         db_password = os.getenv("SUPABASE_DB_PASSWORD")
+         db_sslmode = os.getenv("SUPABASE_DB_SSLMODE", "require")
+
+         conn_string = (
+             f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
+             f"?sslmode={db_sslmode}"
+         )
+
+         pool = AsyncConnectionPool(
+             conninfo=conn_string,
+             max_size=5,
+             kwargs={"autocommit": True, "prepare_threshold": None},
+         )
+         await pool.open()
+
+         vector_store = DonorVectorStore(pool)
+
+         # Initialize LLM
+         if ollama_api_key:
+             llm = ChatOllama(
+                 model="gpt-oss:120b",
+                 base_url="https://ollama.com",
+                 client_kwargs={
+                     "headers": {"Authorization": f"Bearer {ollama_api_key}"}
+                 }
+             )
+         else:
+             llm = ChatOllama(model="gpt-oss:120b-cloud")
+
+         print("  ✅ All components initialized")
+
+         # Create agent
+         agent = AgenticRAGAgent(llm, encoder, vector_store)
+         print("  ✅ Agentic RAG agent created")
+
+         # Test a query
+         print("\n🤖 Running agent query: 'Find donors interested in education'")
+         print("-" * 40)
+
+         result = await agent.search("Find donors interested in education in Singapore")
+
+         print(f"\n📝 Response:\n{result['response'][:500]}...")
+         print(f"\n🔧 Tool calls made: {len(result['tool_calls'])}")
+         for tc in result['tool_calls']:
+             print(f"  - {tc['tool']}: {tc['args']}")
+         print(f"\n📊 Total messages: {result['message_count']}")
+
+         # Cleanup
+         await pool.close()
+         print("\n✅ Agent test completed!")
+
+     except Exception as e:
+         import traceback
+         print(f"  ❌ Error: {e}")
+         traceback.print_exc()
+
+
+ async def main():
+     """Run all tests."""
+     print("\n🚀 Agentic RAG Test Suite")
+     print("=" * 60)
+
+     await test_rag_tools()
+     await test_agentic_rag_agent()
+
+     print("\n" + "=" * 60)
+     print("All tests completed!")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     # Windows async fix
+     if sys.platform == "win32":
+         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
+     asyncio.run(main())
test_api.py ADDED
@@ -0,0 +1,95 @@
+ """Test the lookalike API endpoint."""
+
+ import asyncio
+ from app import find_lookalike_clients, LookalikeRequest
+
+
+ async def test_lookalike_endpoint():
+     """Test the lookalike endpoint with hybrid matching."""
+     print("=" * 60)
+     print("Testing Lookalike API Endpoint")
+     print("=" * 60)
+
+     # Test 1: Basic request
+     print("\nTest 1: Basic request with education cause")
+     req = LookalikeRequest(
+         seed_causes=["education"],
+         seed_interests=["sustainability"],
+         limit=15,
+         include_geojson=True,
+     )
+
+     result = await find_lookalike_clients(req)
+
+     print(f"Total found: {result.total_found}")
+     print(
+         f"Tiers: T1={len(result.tiers['tier_1'])}, T2={len(result.tiers['tier_2'])}, T3={len(result.tiers['tier_3'])}"
+     )
+     print(
+         f"GeoJSON features: {len(result.geojson['features']) if result.geojson else 0}"
+     )
+
+     if result.tiers["tier_1"]:
+         top = result.tiers["tier_1"][0]
+         print(f"\nTop match: {top.user_id}")
+         print(f"  Score: {top.final_score:.3f}")
+         print(f"  Causes: {top.causes}")
+         print(f"  Area: {top.planning_area}")
+
+     # Test 2: With planning area filter
+     print("\n" + "-" * 60)
+     print("Test 2: With planning area filter (bishan)")
+     req2 = LookalikeRequest(
+         seed_causes=["education", "children"],
+         seed_interests=["community"],
+         planning_area_filter="bishan",
+         limit=10,
+         include_geojson=False,
+     )
+
+     result2 = await find_lookalike_clients(req2)
+     print(f"Total found in Bishan: {result2.total_found}")
+
+     # Test 3: With housing type filter
+     print("\n" + "-" * 60)
+     print("Test 3: With housing type filter (condo, landed)")
+     req3 = LookalikeRequest(
+         seed_causes=["environment"],
+         seed_interests=["technology"],
+         housing_type_filter=["condo", "landed"],
+         limit=10,
+         include_geojson=False,
+     )
+
+     result3 = await find_lookalike_clients(req3)
+     print(f"Total found (high-income housing): {result3.total_found}")
+
+     for client in result3.tiers["tier_1"][:3]:
+         print(
+             f"  - {client.user_id}: {client.housing_type}, score={client.final_score:.3f}"
+         )
+
+     # Test 4: Low minimum score to get all matches
+     print("\n" + "-" * 60)
+     print("Test 4: Relaxed min_score (0.0)")
+     req4 = LookalikeRequest(
+         seed_causes=["health"],
+         seed_interests=[],
+         min_score=0.0,
+         limit=30,
+         include_geojson=True,
+     )
+
+     result4 = await find_lookalike_clients(req4)
+     print(f"Total found: {result4.total_found}")
+     print(
+         f"Score range: {min(c.final_score for t in result4.tiers.values() for c in t):.3f} - {max(c.final_score for t in result4.tiers.values() for c in t):.3f}"
+     )
+
+     print("\n" + "=" * 60)
+     print("All API tests passed!")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     asyncio.run(test_lookalike_endpoint())
test_gis.py ADDED
@@ -0,0 +1,212 @@
+ """Test the GIS recommender with dimensionality reduction."""
+
+ import numpy as np
+ from recommender.gis_recommender import (
+     EmbeddingReducer,
+     HybridSemanticSpatialEncoder,
+     generate_mock_clients,
+     generate_seed_donor_profile,
+     GISRecommender,
+     ClientProfile,
+     HousingType,
+ )
+
+
+ def test_embedding_reducer():
+     """Test the embedding reducer."""
+     print("=" * 50)
+     print("Testing EmbeddingReducer")
+     print("=" * 50)
+
+     # Create sample sparse embeddings (like SEA-LION output)
+     sample_embedding = np.zeros(1024)
+     sample_embedding[0] = 0.206
+     sample_embedding[1] = -0.198
+     sample_embedding[10] = 0.178
+     sample_embedding[50] = -0.145
+     sample_embedding[100] = 0.234
+     sample_embedding[200] = -0.167
+     sample_embedding[500] = 0.189
+     sample_embedding[800] = -0.156
+
+     # Test sparse projection
+     reduced = EmbeddingReducer.compute_sparse_projection(
+         sample_embedding, n_components=8
+     )
+     print(f"Original dims: {len(sample_embedding)}")
+     print(f"Reduced dims: {len(reduced)}")
+     print(f"Reduced values: {reduced}")
+     print(f"Reduced norm: {np.linalg.norm(reduced):.4f}")
+     print()
+
+
+ def test_hybrid_encoder():
+     """Test the hybrid semantic-spatial encoder."""
+     print("=" * 50)
+     print("Testing HybridSemanticSpatialEncoder")
+     print("=" * 50)
+
+     encoder = HybridSemanticSpatialEncoder(semantic_dims=8)
+
+     # Test with sample embedding and Singapore coordinates
+     embedding = np.random.randn(1024)
+     coords = (1.3521, 103.8198)  # Singapore center
+
+     hybrid = encoder.encode(embedding, coords)
+     print(f"Hybrid vector dims: {len(hybrid)}")  # Should be 8 + 2 = 10
+     print(f"Hybrid values: {hybrid}")
+
+     # Test similarity between nearby points
+     coords2 = (1.3525, 103.8195)  # Very close
+     hybrid2 = encoder.encode(embedding, coords2)
+
+     sim_same = encoder.compute_similarity(hybrid, hybrid2)
+     print(f"Similarity (same embedding, close coords): {sim_same:.4f}")
+
+     # Test with different embedding
+     embedding3 = np.random.randn(1024)
+     hybrid3 = encoder.encode(embedding3, coords)
+
+     sim_diff = encoder.compute_similarity(hybrid, hybrid3)
+     print(f"Similarity (diff embedding, same coords): {sim_diff:.4f}")
+     print()
+
+
+ def test_mock_clients():
+     """Test mock client generation with embeddings."""
+     print("=" * 50)
+     print("Testing Mock Client Generation")
+     print("=" * 50)
+
+     seed = generate_seed_donor_profile("education")
+     print(f"Seed profile: {seed.user_id}")
+     print(f"  - Causes: {seed.causes}")
+     print(f"  - Full embedding dims: {len(seed.embedding)}")
+     print(f"  - Reduced embedding dims: {len(seed.embedding_reduced)}")
+     print(f"  - Hybrid embedding dims: {len(seed.hybrid_embedding)}")
+     print()
+
+     clients = generate_mock_clients(10)
+     print(f"Generated {len(clients)} mock clients")
+     for i, c in enumerate(clients[:3]):
+         print(f"  Client {i}: {c.user_id}")
+         print(f"    - Area: {c.planning_area}, Housing: {c.housing_type.value}")
+         print(f"    - Causes: {c.causes}")
+         print(
+             f"    - Has embeddings: full={c.embedding is not None}, reduced={c.embedding_reduced is not None}"
+         )
+     print()
+
+
+ def test_hybrid_lookalike():
+     """Test hybrid lookalike matching."""
+     print("=" * 50)
+     print("Testing Hybrid Lookalike Matching")
+     print("=" * 50)
+
+     seed = generate_seed_donor_profile("education")
+     candidates = generate_mock_clients(50)
+
+     recommender = GISRecommender()
+
+     # Find lookalikes without filters
+     results = recommender.find_lookalikes_hybrid(
+         seed_profile=seed,
+         candidates=candidates,
+         k=10,
+     )
+
+     print(f"Found {len(results)} lookalikes")
+     print("\nTop 5 matches:")
+     for i, r in enumerate(results[:5]):
+         print(f"  {i+1}. {r.client.user_id}")
+         print(
+             f"     Score: {r.final_score:.3f} (vector={r.vector_similarity_score:.3f}, spatial={r.spatial_proxy_score:.3f}, prox={r.proximity_score:.3f})"
+         )
+         print(f"     Causes: {r.client.causes}")
+         print(f"     Distance: {r.geo_distance_km:.2f} km")
+     print()
+
+     # Test with planning area filter
+     print("\nWith planning area filter (bishan):")
+     results_filtered = recommender.find_lookalikes_hybrid(
+         seed_profile=seed,
+         candidates=candidates,
+         k=10,
+         planning_area_filter="bishan",
+     )
+     print(f"Found {len(results_filtered)} matches in Bishan")
+     for r in results_filtered[:3]:
+         print(f"  - {r.client.user_id}: {r.final_score:.3f}")
+
+
+ def test_tiered_targeting():
+     """Test tiered targeting."""
+     print("=" * 50)
+     print("Testing Tiered Targeting")
+     print("=" * 50)
+
+     seed = generate_seed_donor_profile("education")
+     candidates = generate_mock_clients(100)
+
+     recommender = GISRecommender()
+
+     results = recommender.find_lookalikes_hybrid(
+         seed_profile=seed,
+         candidates=candidates,
+         k=30,
+     )
+
+     tiered = recommender.apply_tiered_targeting(results, min_score=0.0)
+
+     print(f"Tier 1 (High Priority): {len(tiered['tier_1'])} clients")
+     print(f"Tier 2 (Medium Priority): {len(tiered['tier_2'])} clients")
+     print(f"Tier 3 (Lower Priority): {len(tiered['tier_3'])} clients")
+
+     if tiered["tier_1"]:
+         print(
+             f"\nTier 1 score range: {tiered['tier_1'][-1].final_score:.3f} - {tiered['tier_1'][0].final_score:.3f}"
+         )
+     if tiered["tier_3"]:
+         print(
+             f"Tier 3 score range: {tiered['tier_3'][-1].final_score:.3f} - {tiered['tier_3'][0].final_score:.3f}"
+         )
+
+
+ def test_geojson_export():
+     """Test GeoJSON export."""
+     print("=" * 50)
+     print("Testing GeoJSON Export")
+     print("=" * 50)
+
+     seed = generate_seed_donor_profile("education")
+     candidates = generate_mock_clients(20)
+
+     recommender = GISRecommender()
+     results = recommender.find_lookalikes_hybrid(seed, candidates, k=10)
+
+     geojson = recommender.to_geojson(results)
+
+     print(f"GeoJSON type: {geojson['type']}")
+     print(f"Number of features: {len(geojson['features'])}")
+
+     if geojson["features"]:
+         feat = geojson["features"][0]
+         print("\nSample feature:")
+         print(
+             f"  Geometry: {feat['geometry']['type']} at {feat['geometry']['coordinates']}"
+         )
+         print(f"  Properties: {list(feat['properties'].keys())}")
+
+
+ if __name__ == "__main__":
+     test_embedding_reducer()
+     test_hybrid_encoder()
+     test_mock_clients()
+     test_hybrid_lookalike()
+     test_tiered_targeting()
+     test_geojson_export()
+
+     print("\n" + "=" * 50)
+     print("All tests passed!")
+     print("=" * 50)
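The tier buckets exercised in `test_tiered_targeting` can be illustrated with a tiny standalone sketch. The thresholds below (0.7 / 0.4), the function name, and the min-score handling are assumptions for illustration only; the actual cut-offs live in `GISRecommender.apply_tiered_targeting`:

```python
from typing import Dict, List

# Illustrative only: bucket ranked similarity scores into three priority
# tiers using fixed (assumed) thresholds, dropping scores below min_score.
def tier_scores(scores: List[float], min_score: float = 0.0) -> Dict[str, List[float]]:
    tiers: Dict[str, List[float]] = {"tier_1": [], "tier_2": [], "tier_3": []}
    for s in sorted(scores, reverse=True):  # highest-priority first
        if s < min_score:
            continue
        if s >= 0.7:
            tiers["tier_1"].append(s)
        elif s >= 0.4:
            tiers["tier_2"].append(s)
        else:
            tiers["tier_3"].append(s)
    return tiers

print(tier_scores([0.9, 0.65, 0.3, 0.75]))
```

Within each tier the scores stay sorted descending, which is what lets the tests above read `tiered['tier_1'][0]` as the top score and `tiered['tier_1'][-1]` as the bottom of the range.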
tools/__init__.py ADDED
@@ -0,0 +1,40 @@
+ """Tools for LangGraph agents."""
+
+ from .web_search import (
+     search_charity_info,
+     search_charity_ratings,
+     search_charity_comprehensive,
+     CHARITY_SEARCH_TOOLS,
+     openai_web_search,
+     clear_search_cache,
+ )
+
+ from .rag_tools import (
+     semantic_search,
+     filter_by_metadata,
+     get_document_by_id,
+     list_available_categories,
+     hybrid_search,
+     get_statistics,
+     RAG_TOOLS,
+     set_rag_dependencies,
+ )
+
+ __all__ = [
+     # Web search tools
+     "search_charity_info",
+     "search_charity_ratings",
+     "search_charity_comprehensive",
+     "CHARITY_SEARCH_TOOLS",
+     "openai_web_search",
+     "clear_search_cache",
+     # RAG tools
+     "semantic_search",
+     "filter_by_metadata",
+     "get_document_by_id",
+     "list_available_categories",
+     "hybrid_search",
+     "get_statistics",
+     "RAG_TOOLS",
+     "set_rag_dependencies",
+ ]
tools/rag_tools.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Agentic RAG tools for autonomous vector store exploration.
2
+
3
+ This module provides tools that allow an agent to autonomously:
4
+ 1. Search semantically across the vector store
5
+ 2. Filter by metadata fields
6
+ 3. Retrieve specific documents
7
+ 4. List available categories
8
+ 5. Perform hybrid search with filters
9
+
10
+ The agent uses a ReAct loop to iteratively explore and refine its search.
11
+ """
12
+
13
+ import json
14
+ from typing import Optional, Dict, Any, List
15
+ from langchain_core.tools import tool
16
+ from functools import wraps
17
+
18
+ # Global references to be set at initialization
19
+ _encoder = None
20
+ _vector_store = None
21
+
22
+
23
+ def set_rag_dependencies(encoder, vector_store):
24
+ """Set the encoder and vector store instances for RAG tools.
25
+
26
+ Args:
27
+ encoder: The SeaLion encoder instance
28
+ vector_store: The DonorVectorStore instance
29
+ """
30
+ global _encoder, _vector_store
31
+ _encoder = encoder
32
+ _vector_store = vector_store
33
+
34
+
35
+ def _format_results(results: List[Any], include_details: bool = True) -> str:
36
+ """Format search results for agent consumption.
37
+
38
+ Args:
39
+ results: List of SimilarityResult objects
40
+ include_details: Whether to include full form data
41
+
42
+ Returns:
43
+ Formatted string representation of results
44
+ """
45
+ if not results:
46
+ return "No results found."
47
+
48
+ formatted = []
49
+ for i, result in enumerate(results, 1):
50
+ entry = {
51
+ "rank": i,
52
+ "id": result.id,
53
+ "form_type": result.form_type,
54
+ "similarity_score": round(result.score, 4),
55
+ }
56
+
57
+ if include_details and result.form_data:
58
+ # Extract key fields for readability
59
+ form_data = result.form_data
60
+ entry["name"] = form_data.get("name", "Unknown")
61
+ entry["country"] = form_data.get("country", "Unknown")
62
+ entry["causes"] = form_data.get("causes", [])
63
+
64
+ # Include type-specific fields
65
+ if result.form_type == "donor":
66
+ entry["donor_type"] = form_data.get("donor_type", "Unknown")
67
+ entry["donation_frequency"] = form_data.get("donation_frequency")
68
+ elif result.form_type == "volunteer":
69
+ entry["volunteer_type"] = form_data.get("volunteer_type", "Unknown")
70
+ entry["skills"] = form_data.get("skills", [])
71
+ entry["availability"] = form_data.get("availability")
72
+
73
+ formatted.append(entry)
74
+
75
+ return json.dumps(formatted, indent=2, default=str)
76
+
77
+
78
+ @tool
+ async def semantic_search(query: str, limit: int = 5, form_type: Optional[str] = None) -> str:
+     """Search documents by semantic similarity.
+
+     Use this to find donors/volunteers whose profiles match a natural language query.
+     The search uses vector embeddings to find semantically similar entries.
+
+     Args:
+         query: Natural language description of what you're looking for.
+             Examples: "corporate donors interested in education",
+             "volunteers with tech skills in Singapore"
+         limit: Maximum number of results to return (default: 5, max: 20)
+         form_type: Optional filter - "donor" or "volunteer"
+
+     Returns:
+         JSON formatted list of matching profiles with similarity scores
+     """
+     print(f"[Agentic RAG] semantic_search called - query: '{query}', limit: {limit}, form_type: {form_type}")
+     if _encoder is None or _vector_store is None:
+         return "Error: RAG tools not initialized. Call set_rag_dependencies first."
+
+     try:
+         # Encode the query
+         embedding = await _encoder.encode(query)
+
+         # Search the vector store
+         results = await _vector_store.find_similar(
+             query_embedding=embedding,
+             form_type=form_type,
+             limit=min(limit, 20)
+         )
+
+         return _format_results(results)
+     except Exception as e:
+         return f"Search error: {str(e)}"
+
+
115
+ @tool
+ async def filter_by_metadata(
+     field: str,
+     value: str,
+     limit: int = 10
+ ) -> str:
+     """Browse documents filtered by metadata field.
+
+     Use this to find all entries matching a specific metadata value.
+     Useful for exploring what's available before doing semantic search.
+
+     Args:
+         field: The metadata field to filter on.
+             Valid fields: "form_type", "donor_type", "volunteer_type",
+             "country", "availability"
+         value: The value to match.
+             Examples: form_type="donor", country="SG", donor_type="corporate"
+         limit: Maximum number of results (default: 10)
+
+     Returns:
+         JSON formatted list of matching entries
+     """
+     print(f"[Agentic RAG] filter_by_metadata called - field: '{field}', value: '{value}', limit: {limit}")
+     if _vector_store is None:
+         return "Error: RAG tools not initialized."
+
+     try:
+         # Map field to actual database query approach
+         if field == "form_type":
+             results = await _vector_store.find_by_form_type(value, limit=limit)
+         else:
+             # For other fields, we need to search through text_content
+             # Use a raw query approach
+             async with _vector_store.pool.connection() as conn:
+                 async with conn.cursor() as cur:
+                     # Build ILIKE pattern for JSON field search
+                     pattern = f'%"{field}": "{value}"%'
+
+                     await cur.execute(
+                         """
+                         SELECT source_id, text_content, metadata
+                         FROM my_embeddings
+                         WHERE text_content ILIKE %s
+                         LIMIT %s
+                         """,
+                         (pattern, limit)
+                     )
+                     rows = await cur.fetchall()
+
+             # Convert to SimilarityResult-like format
+             from recommender.vector_store import SimilarityResult, _parse_json_field
+             results = []
+             for row in rows:
+                 form_data = _parse_json_field(row[1])
+                 metadata = _parse_json_field(row[2])
+                 results.append(SimilarityResult(
+                     id=row[0],
+                     form_data=form_data,
+                     form_type=metadata.get("form_type", "unknown"),
+                     score=1.0,
+                     distance=0.0
+                 ))
+
+         return _format_results(results)
+     except Exception as e:
+         return f"Filter error: {str(e)}"
+
+
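Note that `filter_by_metadata` falls back to a substring match (`ILIKE '%"field": "value"%'`) over the serialized JSON rather than a JSON operator, so it only matches when `text_content` stores keys in the exact `"field": "value"` form. A self-contained sketch of what that pattern does and does not match (the helper names here are illustrative, not part of the module):

```python
import json

def ilike_pattern(field: str, value: str) -> str:
    # Same shape the tool embeds between the % wildcards: "field": "value"
    return f'"{field}": "{value}"'

def matches(text_content: str, field: str, value: str) -> bool:
    # ILIKE '%...%' behaves as a case-insensitive substring test
    return ilike_pattern(field, value).lower() in text_content.lower()

# json.dumps serializes keys as '"key": "value"', so the pattern lines up
doc = json.dumps({"country": "SG", "donor_type": "corporate"})
hit = matches(doc, "country", "SG")    # True: substring present
miss = matches(doc, "country", "MY")   # False: no such pair in the JSON
```

If the stored JSON ever used different spacing (e.g. compact `"country":"SG"`), the pattern would silently miss it, which is the main caveat of this approach.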
183
+ @tool
+ async def get_document_by_id(doc_id: str) -> str:
+     """Retrieve a specific document by ID for deeper inspection.
+
+     Use this when you've identified a promising result from search
+     and want to see the complete profile details.
+
+     Args:
+         doc_id: The unique document/form ID (e.g., "donor_12345")
+
+     Returns:
+         Complete JSON representation of the document
+     """
+     print(f"[Agentic RAG] get_document_by_id called - doc_id: '{doc_id}'")
+     if _vector_store is None:
+         return "Error: RAG tools not initialized."
+
+     try:
+         result = await _vector_store.get_embedding(doc_id)
+
+         if result is None:
+             return f"Document with ID '{doc_id}' not found."
+
+         # Return full document details
+         document = {
+             "id": result.id,
+             "form_type": result.form_type,
+             "data": result.form_data
+         }
+
+         return json.dumps(document, indent=2, default=str)
+     except Exception as e:
+         return f"Retrieval error: {str(e)}"
+
+
218
+ @tool
+ async def list_available_categories() -> str:
+     """List all unique values for filterable fields.
+
+     Use this first to understand what categories exist in the database
+     before performing filtered searches. Returns available:
+     - Form types (donor, volunteer)
+     - Countries (ASEAN country codes)
+     - Causes (education, health, etc.)
+     - Donor types (individual, corporate, foundation)
+     - Volunteer types (regular, event_based, skilled)
+
+     Returns:
+         JSON object with distinct values for each category
+     """
+     print("[Agentic RAG] list_available_categories called")
+     if _vector_store is None:
+         return "Error: RAG tools not initialized."
+
+     try:
+         async with _vector_store.pool.connection() as conn:
+             async with conn.cursor() as cur:
+                 # Get form type counts
+                 await cur.execute("""
+                     SELECT
+                         metadata->>'form_type' as form_type,
+                         COUNT(*) as count
+                     FROM my_embeddings
+                     GROUP BY metadata->>'form_type'
+                 """)
+                 form_types = {row[0]: row[1] for row in await cur.fetchall()}
+
+                 # Get distinct countries
+                 await cur.execute("""
+                     SELECT DISTINCT text_content::json->>'country' as country
+                     FROM my_embeddings
+                     WHERE text_content::json->>'country' IS NOT NULL
+                 """)
+                 countries = [row[0] for row in await cur.fetchall() if row[0]]
+
+                 # Get distinct donor types
+                 await cur.execute("""
+                     SELECT DISTINCT text_content::json->>'donor_type' as dtype
+                     FROM my_embeddings
+                     WHERE text_content::json->>'donor_type' IS NOT NULL
+                 """)
+                 donor_types = [row[0] for row in await cur.fetchall() if row[0]]
+
+                 # Get distinct volunteer types
+                 await cur.execute("""
+                     SELECT DISTINCT text_content::json->>'volunteer_type' as vtype
+                     FROM my_embeddings
+                     WHERE text_content::json->>'volunteer_type' IS NOT NULL
+                 """)
+                 volunteer_types = [row[0] for row in await cur.fetchall() if row[0]]
+
+                 # Get all causes (need to aggregate from arrays)
+                 await cur.execute("""
+                     SELECT text_content
+                     FROM my_embeddings
+                     WHERE text_content LIKE '%causes%'
+                     LIMIT 100
+                 """)
+                 rows = await cur.fetchall()
+
+         all_causes = set()
+         for row in rows:
+             try:
+                 if isinstance(row[0], str):
+                     data = json.loads(row[0])
+                 else:
+                     data = row[0]
+                 causes = data.get("causes", [])
+                 if isinstance(causes, list):
+                     all_causes.update(causes)
+             except (json.JSONDecodeError, TypeError):
+                 pass
+
+         categories = {
+             "form_types": form_types,
+             "countries": sorted(countries),
+             "donor_types": sorted(donor_types),
+             "volunteer_types": sorted(volunteer_types),
+             "causes": sorted(all_causes),
+             "total_records": sum(form_types.values()) if form_types else 0
+         }
+
+         return json.dumps(categories, indent=2)
+     except Exception as e:
+         return f"Error listing categories: {str(e)}"
+
+
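The causes aggregation in `list_available_categories` happens in plain Python rather than SQL, because causes live inside a JSON array in `text_content`. A standalone sketch of that aggregation step, using hand-built sample rows in place of database results:

```python
import json

# Sample rows shaped like cursor.fetchall() output: 1-tuples of JSON text
rows = [
    ('{"name": "A", "causes": ["education", "health"]}',),
    ('{"name": "B", "causes": ["health", "environment"]}',),
    ('{"name": "C"}',),  # entry with no causes key is skipped gracefully
]

all_causes = set()
for row in rows:
    try:
        # Rows may arrive as JSON text or already-parsed objects
        data = json.loads(row[0]) if isinstance(row[0], str) else row[0]
        causes = data.get("causes", [])
        if isinstance(causes, list):
            all_causes.update(causes)  # set dedupes across entries
    except (json.JSONDecodeError, TypeError):
        pass  # malformed rows are ignored rather than failing the whole call

result = sorted(all_causes)
```

Using a set keeps the output deduplicated even when the same cause appears in many profiles, and sorting gives the LLM a stable, readable list.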
310
+ @tool
+ async def hybrid_search(
+     query: str,
+     country: Optional[str] = None,
+     form_type: Optional[str] = None,
+     causes: Optional[List[str]] = None,
+     limit: int = 10
+ ) -> str:
+     """Combine semantic search with metadata filters.
+
+     Use this for targeted searches that combine meaning (semantic)
+     with specific constraints (filters). More precise than pure
+     semantic search when you know specific criteria.
+
+     Args:
+         query: Natural language query for semantic matching
+         country: Optional country code filter (e.g., "SG", "MY", "TH")
+         form_type: Optional form type filter ("donor" or "volunteer")
+         causes: Optional list of cause categories to match
+         limit: Maximum number of results (default: 10)
+
+     Returns:
+         JSON formatted list of results matching both semantic query and filters
+     """
+     print(f"[Agentic RAG] hybrid_search called - query: '{query}', country: {country}, form_type: {form_type}, causes: {causes}, limit: {limit}")
+     if _encoder is None or _vector_store is None:
+         return "Error: RAG tools not initialized."
+
+     try:
+         # Encode the query
+         embedding = await _encoder.encode(query)
+
+         # Use cause-based hybrid search if causes specified
+         if causes and len(causes) > 0:
+             results = await _vector_store.find_by_causes(
+                 target_causes=causes,
+                 query_embedding=embedding,
+                 limit=limit
+             )
+
+             # Apply additional filters if needed
+             if form_type or country:
+                 filtered = []
+                 for r in results:
+                     if form_type and r.form_type != form_type:
+                         continue
+                     if country and r.form_data.get("country") != country:
+                         continue
+                     filtered.append(r)
+                 results = filtered[:limit]
+         else:
+             # Standard similarity search with filters
+             results = await _vector_store.find_similar(
+                 query_embedding=embedding,
+                 form_type=form_type,
+                 limit=limit,
+                 country_filter=country
+             )
+
+         return _format_results(results)
+     except Exception as e:
+         return f"Hybrid search error: {str(e)}"
+
+
374
+ @tool
+ async def get_statistics() -> str:
+     """Get overall statistics about the vector store.
+
+     Use this to understand the size and composition of the database
+     before starting your search.
+
+     Returns:
+         JSON with counts by form type and other aggregate stats
+     """
+     print("[Agentic RAG] get_statistics called")
+     if _vector_store is None:
+         return "Error: RAG tools not initialized."
+
+     try:
+         counts = await _vector_store.count_by_type()
+         return json.dumps({
+             "database_statistics": counts,
+             "description": "Number of entries by form type in the vector store"
+         }, indent=2)
+     except Exception as e:
+         return f"Error getting statistics: {str(e)}"
+
+
398
+ # Export all RAG tools as a list for easy registration
+ RAG_TOOLS = [
+     semantic_search,
+     filter_by_metadata,
+     get_document_by_id,
+     list_available_categories,
+     hybrid_search,
+     get_statistics,
+ ]
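Every tool in this module returns a JSON string (or a plain error string) rather than Python objects, so the LLM can consume results directly. A minimal standalone sketch of that ranked-JSON convention, where `FakeResult` is a stand-in for the real `SimilarityResult`:

```python
import json
from dataclasses import dataclass

@dataclass
class FakeResult:
    # Illustrative stand-in for SimilarityResult; not the module's class
    id: str
    form_type: str
    score: float

def format_results(results):
    # Mirrors the _format_results convention: 1-based rank, rounded score
    if not results:
        return "No results found."
    return json.dumps(
        [
            {"rank": i, "id": r.id, "form_type": r.form_type,
             "similarity_score": round(r.score, 4)}
            for i, r in enumerate(results, 1)
        ],
        indent=2,
    )

out = format_results([FakeResult("donor_1", "donor", 0.91234),
                      FakeResult("vol_2", "volunteer", 0.84567)])
```

Returning strings everywhere (including for errors) means the agent framework never has to special-case tool failures; the model simply reads the error text and adjusts its next call.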
tools/web_search.py ADDED
@@ -0,0 +1,172 @@
+ """Web search tool for searching charity organization information using OpenAI."""
+
+ import os
+ from pprint import pprint
+ from typing import Dict, Any, Optional
+ from openai import OpenAI
+ from langchain_core.tools import tool
+
+
+ # Simple in-memory cache to avoid duplicate searches within a session
+ _search_cache: Dict[str, str] = {}
+
+
+ def get_openai_client():
+     """Get OpenAI client instance."""
+     return OpenAI()
+
+
+ def clear_search_cache():
+     """Clear the search cache. Call this at the start of a new conversation."""
+     global _search_cache
+     _search_cache.clear()
+     print("🗑️ Search cache cleared")
+
+
26
+ def openai_web_search(query: str, use_cache: bool = True) -> str:
+     """Perform web search using OpenAI's web_search tool.
+
+     Args:
+         query: The search query
+         use_cache: Whether to use cached results if available
+
+     Returns:
+         Search results as text
+     """
+     # Check cache first
+     cache_key = query.lower().strip()
+     if use_cache and cache_key in _search_cache:
+         print("\n" + "=" * 50)
+         print("📦 RETURNING CACHED SEARCH RESULT")
+         print("=" * 50)
+         pprint({"query": query, "cached": True})
+         print("=" * 50 + "\n")
+         return _search_cache[cache_key]
+
+     print("\n" + "=" * 50)
+     print("🔍 OPENAI WEB SEARCH CALLED")
+     print("=" * 50)
+     pprint({"query": query})
+     print("=" * 50 + "\n")
+
+     client = get_openai_client()
+
+     try:
+         response = client.responses.create(
+             model="gpt-5",
+             tools=[{"type": "web_search"}],
+             input=query
+         )
+         print("\n" + "-" * 50)
+         print("✅ SEARCH RESULTS RECEIVED")
+         print("-" * 50)
+         pprint({"output_length": len(response.output_text)})
+         print("-" * 50 + "\n")
+
+         # Cache the result
+         _search_cache[cache_key] = response.output_text
+
+         return response.output_text
+     except Exception as e:
+         print(f"\n❌ SEARCH FAILED: {str(e)}\n")
+         return f"Search failed: {str(e)}"
+
+
75
+ @tool
+ def search_charity_comprehensive(charity_name: str) -> str:
+     """Search the web for comprehensive information about a charity organization.
+
+     This tool performs a SINGLE optimized search to find ALL relevant information
+     about a charity including:
+     - Mission and programs
+     - Charity ratings (Charity Navigator, GuideStar, BBB)
+     - Financial transparency and accountability
+     - Recent news and impact reports
+     - Contact information and ways to donate
+
+     Use this as your PRIMARY tool - it combines general info and ratings in one search.
+
+     Args:
+         charity_name: The name of the charity organization to research.
+             Example: "Red Cross" or "Doctors Without Borders"
+
+     Returns:
+         Comprehensive search results about the charity including ratings and programs.
+     """
+     print("\n📋 TOOL CALLED: search_charity_comprehensive")
+     pprint({"charity_name": charity_name})
+
+     # Build a comprehensive query that covers all aspects in ONE search
+     comprehensive_query = (
+         f"{charity_name} charity nonprofit organization "
+         f"mission programs impact "
+         f"Charity Navigator rating GuideStar "
+         f"financial transparency accountability review"
+     )
+
+     try:
+         results = openai_web_search(comprehensive_query)
+         return results
+     except Exception as e:
+         return f"Search failed: {str(e)}. Please try again with a different query."
+
+
114
+ @tool
+ def search_charity_info(query: str) -> str:
+     """Search the web for information about a charity organization.
+
+     NOTE: Prefer using search_charity_comprehensive for most queries as it
+     combines general info and ratings in a single search.
+
+     Use this tool only when you need to search for something very specific
+     that isn't covered by comprehensive search.
+
+     Args:
+         query: The search query about the charity organization.
+             Example: "Red Cross disaster relief programs 2024"
+
+     Returns:
+         Search results containing relevant information about the charity.
+     """
+     print("\n📋 TOOL CALLED: search_charity_info")
+     pprint({"input_query": query})
+
+     # Enhance query for charity-specific searches
+     enhanced_query = f"{query} charity nonprofit organization"
+
+     try:
+         results = openai_web_search(enhanced_query)
+         return results
+     except Exception as e:
+         return f"Search failed: {str(e)}. Please try again with a different query."
+
+
144
+ @tool
+ def search_charity_ratings(charity_name: str) -> str:
+     """Search for charity ratings and reviews from watchdog organizations.
+
+     NOTE: Prefer using search_charity_comprehensive as it already includes
+     rating information. Use this only if you specifically need MORE detailed
+     rating information after the comprehensive search.
+
+     Args:
+         charity_name: The name of the charity to look up ratings for.
+
+     Returns:
+         Information about the charity's ratings and accountability.
+     """
+     print("\n⭐ TOOL CALLED: search_charity_ratings")
+     pprint({"charity_name": charity_name})
+
+     query = f"{charity_name} charity rating Charity Navigator GuideStar review"
+
+     try:
+         results = openai_web_search(query)
+         return results
+     except Exception as e:
+         return f"Rating search failed: {str(e)}. Please try again."
+
+
170
+ # List of all available tools for the charity search agent
+ # Put comprehensive search FIRST so the LLM prefers it
+ CHARITY_SEARCH_TOOLS = [search_charity_comprehensive, search_charity_info, search_charity_ratings]
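The module-level cache keys on the normalized query (`query.lower().strip()`), so trivially different spellings of the same search hit one cache entry and the network call runs only once per session. A self-contained sketch of that pattern, with a stubbed search function standing in for the OpenAI call:

```python
from typing import Callable, Dict

def cached_search(query: str, do_search: Callable[[str], str],
                  cache: Dict[str, str]) -> str:
    # Normalize the key the same way openai_web_search does
    key = query.lower().strip()
    if key in cache:
        return cache[key]          # cache hit: skip the (expensive) search
    result = do_search(query)
    cache[key] = result            # store for the rest of the session
    return result

calls = []
def fake_search(q: str) -> str:    # stub standing in for the real web search
    calls.append(q)
    return f"results for {q}"

cache: Dict[str, str] = {}
a = cached_search("Red Cross", fake_search, cache)
b = cached_search("  red cross ", fake_search, cache)  # same normalized key
```

Because the cache lives for the process, `clear_search_cache` exists to reset it between conversations; otherwise a stale result from an earlier session could be returned indefinitely.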