Spaces:
Sleeping
Sleeping
GitHub Actions committed on
Commit ·
1d32142
0
Parent(s):
Sync from GitHub
Browse files- Dockerfile +29 -0
- README.md +49 -0
- agents/__init__.py +19 -0
- agents/agentic_rag.py +231 -0
- agents/base.py +78 -0
- agents/charity_search.py +111 -0
- agents/classifier.py +71 -0
- agents/logical.py +29 -0
- agents/therapist.py +24 -0
- app.py +1817 -0
- encoders/__init__.py +5 -0
- encoders/base.py +43 -0
- encoders/sealion.py +382 -0
- graph/__init__.py +6 -0
- graph/builder.py +123 -0
- graph/router.py +15 -0
- graph/state.py +8 -0
- recommender/__init__.py +29 -0
- recommender/gis_recommender.py +1202 -0
- recommender/vector_store.py +404 -0
- requirements.txt +21 -0
- test_agentic_rag.py +222 -0
- test_api.py +95 -0
- test_gis.py +212 -0
- tools/__init__.py +40 -0
- tools/rag_tools.py +406 -0
- tools/web_search.py +172 -0
Dockerfile
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Lightweight Python base image; 3.11-slim keeps the final image small.
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies.
# NOTE(review): the original `apt-get install` listed no packages, so this
# layer is a no-op placeholder. Add native build deps (e.g. gcc, libpq-dev)
# to the continuation line below when a wheel needs compiling.
RUN apt-get update && apt-get install -y --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency installation is cached until
# requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .
COPY graph/ ./graph/
COPY agents/ ./agents/
COPY encoders/ ./encoders/
COPY recommender/ ./recommender/
COPY tools/ ./tools/

# Create non-root user for security (required by HF Spaces)
RUN useradd -m -u 1000 user
USER user

# HF Spaces expects port 7860
EXPOSE 7860

# Run the FastAPI app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Ollama API Proxy
|
| 3 |
+
emoji: 🦙
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Ollama API Proxy
|
| 11 |
+
|
| 12 |
+
A FastAPI-based proxy for the Ollama API hosted on Hugging Face Spaces.
|
| 13 |
+
|
| 14 |
+
## Endpoints
|
| 15 |
+
|
| 16 |
+
### GET /
|
| 17 |
+
Health check endpoint returning service status.
|
| 18 |
+
|
| 19 |
+
### GET /health
|
| 20 |
+
Simple health check endpoint.
|
| 21 |
+
|
| 22 |
+
### POST /chat
|
| 23 |
+
Send a chat message to the Ollama API.
|
| 24 |
+
|
| 25 |
+
**Request Body:**
|
| 26 |
+
```json
|
| 27 |
+
{
|
| 28 |
+
"message": "Your message here",
|
| 29 |
+
"model": "gpt-oss:120b",
|
| 30 |
+
"stream": true
|
| 31 |
+
}
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
**Response (non-streaming):**
|
| 35 |
+
```json
|
| 36 |
+
{
|
| 37 |
+
"response": "The AI response"
|
| 38 |
+
}
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## Environment Variables
|
| 42 |
+
|
| 43 |
+
- `OLLAMA_API_KEY`: Your Ollama API key (set as a secret in HF Spaces)
|
| 44 |
+
|
| 45 |
+
## Setup
|
| 46 |
+
|
| 47 |
+
1. Create a new Space on Hugging Face with Docker SDK
|
| 48 |
+
2. Add `OLLAMA_API_KEY` as a repository secret
|
| 49 |
+
3. Push this code to the Space repository
|
agents/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LangGraph agent nodes."""
|
| 2 |
+
from .base import BaseMemoryAgent
|
| 3 |
+
from .classifier import create_classifier, classify_message
|
| 4 |
+
from .therapist import TherapistAgent
|
| 5 |
+
from .logical import LogicalAgent
|
| 6 |
+
from .charity_search import CharitySearchAgent, create_charity_search_agent
|
| 7 |
+
from .agentic_rag import AgenticRAGAgent, create_agentic_rag_agent
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"BaseMemoryAgent",
|
| 11 |
+
"create_classifier",
|
| 12 |
+
"classify_message",
|
| 13 |
+
"TherapistAgent",
|
| 14 |
+
"LogicalAgent",
|
| 15 |
+
"CharitySearchAgent",
|
| 16 |
+
"create_charity_search_agent",
|
| 17 |
+
"AgenticRAGAgent",
|
| 18 |
+
"create_agentic_rag_agent",
|
| 19 |
+
]
|
agents/agentic_rag.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agentic RAG Agent for autonomous vector store exploration.
|
| 2 |
+
|
| 3 |
+
This agent uses a ReAct loop to iteratively explore the vector database,
|
| 4 |
+
making autonomous decisions about how to search, filter, and refine results.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import uuid
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from typing import Optional, Dict, Any, List
|
| 10 |
+
|
| 11 |
+
from langgraph.prebuilt import create_react_agent
|
| 12 |
+
from langgraph.store.base import BaseStore
|
| 13 |
+
from langchain_core.runnables import RunnableConfig
|
| 14 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
| 15 |
+
|
| 16 |
+
from tools.rag_tools import RAG_TOOLS, set_rag_dependencies
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
AGENTIC_RAG_SYSTEM_PROMPT = """You are an intelligent research agent with access to a vector database containing donor and volunteer profiles.
|
| 20 |
+
|
| 21 |
+
Your goal is to help users find the most relevant matches by autonomously exploring the database.
|
| 22 |
+
|
| 23 |
+
## Available Tools
|
| 24 |
+
|
| 25 |
+
1. **list_available_categories** - Start here to understand what data exists (countries, causes, types)
|
| 26 |
+
2. **get_statistics** - Get database size and composition
|
| 27 |
+
3. **semantic_search** - Find profiles by natural language query
|
| 28 |
+
4. **filter_by_metadata** - Browse by specific field values (country, form_type, etc.)
|
| 29 |
+
5. **hybrid_search** - Combine semantic search with filters for precise results
|
| 30 |
+
6. **get_document_by_id** - Get full details of a specific profile
|
| 31 |
+
|
| 32 |
+
## Search Strategy
|
| 33 |
+
|
| 34 |
+
Follow this iterative exploration process:
|
| 35 |
+
|
| 36 |
+
1. **Understand the Request**: Parse what the user is looking for
|
| 37 |
+
2. **Explore Categories**: Use list_available_categories to see what's available
|
| 38 |
+
3. **Initial Search**: Start with semantic_search or hybrid_search
|
| 39 |
+
4. **Evaluate Results**: Check if results match user needs
|
| 40 |
+
5. **Refine if Needed**: Try different queries or filters if initial results aren't ideal
|
| 41 |
+
6. **Deep Dive**: Use get_document_by_id for promising candidates
|
| 42 |
+
|
| 43 |
+
## Best Practices
|
| 44 |
+
|
| 45 |
+
- Always explore categories first to understand the data structure
|
| 46 |
+
- Combine semantic understanding with metadata filters for best results
|
| 47 |
+
- If results seem off, try rephrasing the query or adjusting filters
|
| 48 |
+
- Look at multiple candidates before making recommendations
|
| 49 |
+
- Provide clear reasoning about why you selected certain results
|
| 50 |
+
|
| 51 |
+
## Example Exploration
|
| 52 |
+
|
| 53 |
+
User: "Find donors interested in education in Singapore"
|
| 54 |
+
|
| 55 |
+
Your approach:
|
| 56 |
+
1. Call list_available_categories() to confirm "education" is a valid cause and "SG" is a country
|
| 57 |
+
2. Call hybrid_search(query="education donors", country="SG", form_type="donor")
|
| 58 |
+
3. Review results - if they're corporate donors but user wants individuals, refine
|
| 59 |
+
4. Call hybrid_search(query="individual education supporters", country="SG", form_type="donor")
|
| 60 |
+
5. Call get_document_by_id() on top matches for full details
|
| 61 |
+
6. Present findings with explanation
|
| 62 |
+
|
| 63 |
+
Always explain your search process and reasoning to the user."""
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class AgenticRAGAgent:
    """Agent that autonomously explores a vector database using RAG tools.

    Uses LangGraph's ReAct pattern to iteratively search, filter, and
    retrieve documents based on user queries.

    Attributes:
        llm: The language model for reasoning
        tools: List of RAG tools for vector store exploration
        react_agent: The compiled ReAct agent
        encoder: The embedding encoder
        vector_store: The vector store instance
    """

    def __init__(self, llm, encoder=None, vector_store=None):
        """Initialize the Agentic RAG Agent.

        Args:
            llm: Language model for reasoning and tool use
            encoder: SeaLion encoder for query embedding (can be set later)
            vector_store: DonorVectorStore instance (can be set later)
        """
        self.llm = llm
        self.tools = RAG_TOOLS
        self.encoder = encoder
        self.vector_store = vector_store

        # Wire the module-level tool dependencies only when both are known;
        # otherwise callers must use set_dependencies() before searching.
        if encoder and vector_store:
            self.set_dependencies(encoder, vector_store)

        # Build the ReAct agent once; it is reused across calls.
        self.react_agent = create_react_agent(
            model=llm,
            tools=self.tools,
        )

    def set_dependencies(self, encoder, vector_store):
        """Set encoder and vector store after initialization.

        Args:
            encoder: The SeaLion encoder instance
            vector_store: The DonorVectorStore instance
        """
        self.encoder = encoder
        self.vector_store = vector_store
        # RAG_TOOLS read these via module-level globals in tools.rag_tools.
        set_rag_dependencies(encoder, vector_store)

    async def retrieve_memories(self, store: BaseStore, user_id: str, query: str) -> str:
        """Fetch relevant memories for this user, newline-joined."""
        namespace = ("memories", user_id)
        memories = await store.asearch(namespace, query=query)
        return "\n".join([d.value.get("data", "") for d in memories])

    async def store_message(self, store: BaseStore, user_id: str, content: str, role: str):
        """Persist a single message to the memory store.

        Args:
            store: The LangGraph store instance
            user_id: User identifier used as the memory namespace
            content: The message text
            role: Either 'user' or 'assistant'
        """
        memory_id = str(uuid.uuid4())
        namespace = ("memories", user_id)
        await store.aput(namespace, memory_id, {
            "data": content,
            "role": role,
            "timestamp": datetime.now().isoformat()
        })

    async def search(self, query: str, config: Optional[RunnableConfig] = None) -> Dict[str, Any]:
        """Execute a standalone RAG search without state management.

        Args:
            query: The user's search query
            config: Optional runnable config

        Returns:
            Dictionary with 'response', 'tool_calls' and 'message_count' keys
        """
        messages = [
            SystemMessage(content=AGENTIC_RAG_SYSTEM_PROMPT),
            HumanMessage(content=query)
        ]

        result = await self.react_agent.ainvoke(
            {"messages": messages},
            config=config
        )

        # The last message in the ReAct transcript is the final answer.
        final_message = result["messages"][-1]

        # Collect every tool invocation the agent made, for observability.
        tool_calls = []
        for msg in result["messages"]:
            if hasattr(msg, 'tool_calls') and msg.tool_calls:
                for tc in msg.tool_calls:
                    tool_calls.append({
                        "tool": tc.get("name", "unknown"),
                        "args": tc.get("args", {})
                    })

        return {
            "response": final_message.content,
            "tool_calls": tool_calls,
            "message_count": len(result["messages"])
        }

    async def __call__(
        self,
        state: dict,
        config: RunnableConfig,
        *,
        store: BaseStore
    ) -> dict:
        """Execute the agentic RAG agent as a LangGraph node.

        This allows the agent to be used within a larger LangGraph workflow.

        Args:
            state: Current graph state with messages
            config: Runnable configuration with user_id etc.
            store: LangGraph store for memory persistence

        Returns:
            Updated state with agent response
        """
        last_message = state["messages"][-1]
        user_id = config["configurable"].get("user_id", "default_user")

        # Get memories for context
        memory_info = await self.retrieve_memories(store, user_id, str(last_message.content))

        # Build messages with system prompt and memory context
        system_content = AGENTIC_RAG_SYSTEM_PROMPT
        if memory_info:
            system_content += f"\n\n## Previous Conversation Context\n{memory_info}"

        messages = [
            SystemMessage(content=system_content),
            HumanMessage(content=last_message.content)
        ]

        # Store user message. str() matches retrieve_memories' coercion —
        # message content may be a list of content blocks, not a plain str.
        await self.store_message(store, user_id, str(last_message.content), "user")

        # Run the ReAct agent with tools. FIX: propagate config so user_id /
        # thread_id reach the inner agent, as search() already does.
        result = await self.react_agent.ainvoke({"messages": messages}, config=config)

        # Extract the final response
        final_message = result["messages"][-1]
        response_content = final_message.content

        # Store assistant response
        await self.store_message(store, user_id, response_content, "assistant")

        return {"messages": [{"role": "assistant", "content": response_content}]}
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def create_agentic_rag_agent(llm, encoder=None, vector_store=None):
    """Build and return a ready-to-use :class:`AgenticRAGAgent`.

    Args:
        llm: Language model used for reasoning and tool selection
        encoder: Optional query-embedding encoder
        vector_store: Optional vector store to explore

    Returns:
        A configured AgenticRAGAgent instance
    """
    return AgenticRAGAgent(llm, encoder=encoder, vector_store=vector_store)
|
agents/base.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from langgraph.store.base import BaseStore
|
| 5 |
+
from langchain_core.runnables import RunnableConfig
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BaseMemoryAgent(ABC):
    """Base class for agents with memory capabilities.

    Extracts shared logic from therapist_agent and logical_agent:
    - Memory retrieval from store
    - Automatic storage of all conversations (user + assistant messages)
    - Message construction with system prompt + memories
    - LLM invocation and response formatting
    """

    def __init__(self, llm):
        # The chat model; subclasses only customise the system prompt.
        self.llm = llm

    @property
    @abstractmethod
    def system_prompt(self) -> str:
        """Each agent defines its own personality/system prompt."""

    async def retrieve_memories(self, store: BaseStore, user_id: str, query: str) -> str:
        """Return this user's relevant memories as a newline-joined string."""
        hits = await store.asearch(("memories", user_id), query=query)
        return "\n".join(item.value.get("data", "") for item in hits)

    async def store_message(self, store: BaseStore, user_id: str, content: str, role: str):
        """Persist one message to the store (Supabase-backed) automatically.

        Args:
            store: The LangGraph store instance
            user_id: User identifier for namespacing
            content: The message content
            role: Either 'user' or 'assistant'
        """
        record = {
            "data": content,
            "role": role,
            "timestamp": datetime.now().isoformat()
        }
        await store.aput(("memories", user_id), str(uuid.uuid4()), record)

    async def __call__(self, state: dict, config: RunnableConfig, *, store: BaseStore) -> dict:
        """Run the agent as a LangGraph node: recall, prompt, reply, persist."""
        incoming = state["messages"][-1]
        user_id = config["configurable"].get("user_id", "default_user")

        # Pull prior context for this user before building the prompt.
        memory_info = await self.retrieve_memories(store, user_id, str(incoming.content))

        # Inject recalled memories directly into the system prompt.
        full_prompt = f"""{self.system_prompt}

User information from previous sessions:
{memory_info}"""

        chat = [
            {"role": "system", "content": full_prompt},
            {"role": "user", "content": incoming.content}
        ]

        # Persist the user's message before calling the model.
        await self.store_message(store, user_id, incoming.content, "user")

        # NOTE(review): sync invoke inside an async node blocks the event
        # loop if the LLM call is slow — confirm whether ainvoke is available.
        reply = self.llm.invoke(chat)

        # Persist the assistant's reply as well.
        await self.store_message(store, user_id, reply.content, "assistant")

        return {"messages": [{"role": "assistant", "content": reply.content}]}
|
agents/charity_search.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Charity Search Agent with web search capabilities."""
|
| 2 |
+
|
| 3 |
+
import uuid
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from langgraph.store.base import BaseStore
|
| 6 |
+
from langgraph.prebuilt import create_react_agent
|
| 7 |
+
from langchain_core.runnables import RunnableConfig
|
| 8 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
| 9 |
+
|
| 10 |
+
from tools.web_search import CHARITY_SEARCH_TOOLS, clear_search_cache
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
CHARITY_SEARCH_SYSTEM_PROMPT = """You are a helpful charity research assistant specialized in finding information about nonprofit organizations and charities.
|
| 14 |
+
|
| 15 |
+
Your role is to help users:
|
| 16 |
+
1. Find detailed information about specific charity organizations
|
| 17 |
+
2. Research charity ratings and accountability
|
| 18 |
+
3. Discover charities working in specific cause areas
|
| 19 |
+
4. Verify legitimacy and financial transparency of organizations
|
| 20 |
+
|
| 21 |
+
IMPORTANT - SEARCH STRATEGY:
|
| 22 |
+
- ALWAYS use the search_charity_comprehensive tool FIRST - it performs a SINGLE optimized search that retrieves mission, programs, ratings, and financial info all at once
|
| 23 |
+
- Only use search_charity_info or search_charity_ratings if you need ADDITIONAL specific details not covered by the comprehensive search
|
| 24 |
+
- AVOID making multiple searches for the same charity - the comprehensive search already covers most needs
|
| 25 |
+
|
| 26 |
+
When presenting information:
|
| 27 |
+
- Provide clear, factual information from your search results
|
| 28 |
+
- Include source attribution when possible
|
| 29 |
+
- Give a balanced perspective on the organization
|
| 30 |
+
- Suggest further research if needed
|
| 31 |
+
|
| 32 |
+
If you cannot find reliable information, say so clearly and suggest alternative approaches."""
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class CharitySearchAgent:
    """Agent that searches the web for charity organization information.

    Uses LangGraph's ReAct pattern with web search tools to find
    and analyze information about nonprofit organizations.

    Optimized to minimize redundant web searches by:
    1. Using a comprehensive search tool that combines multiple queries
    2. Caching search results to avoid duplicate API calls
    3. Guiding the LLM to prefer single comprehensive searches
    """

    def __init__(self, llm):
        """Bind the LLM to the web-search tools via a ReAct agent.

        Args:
            llm: Language model for reasoning and tool use
        """
        self.llm = llm
        self.tools = CHARITY_SEARCH_TOOLS
        self.react_agent = create_react_agent(
            model=llm,
            tools=self.tools,
        )

    async def retrieve_memories(self, store: BaseStore, user_id: str, query: str) -> str:
        """Fetch relevant memories for this user, newline-joined."""
        namespace = ("memories", user_id)
        memories = await store.asearch(namespace, query=query)
        return "\n".join([d.value.get("data", "") for d in memories])

    async def store_message(self, store: BaseStore, user_id: str, content: str, role: str):
        """Persist a single message ('user' or 'assistant') to the store."""
        memory_id = str(uuid.uuid4())
        namespace = ("memories", user_id)
        await store.aput(namespace, memory_id, {
            "data": content,
            "role": role,
            "timestamp": datetime.now().isoformat()
        })

    async def __call__(self, state: dict, config: RunnableConfig, *, store: BaseStore) -> dict:
        """Execute the charity search agent as a LangGraph node.

        Args:
            state: Current graph state with messages
            config: Runnable configuration with user_id etc.
            store: LangGraph store for memory persistence

        Returns:
            Updated state with the agent's response appended
        """
        last_message = state["messages"][-1]
        user_id = config["configurable"].get("user_id", "default_user")

        # Clear search cache at the start of each new query to avoid stale results
        # but allow caching within the same query execution
        clear_search_cache()

        # Get memories for context
        memory_info = await self.retrieve_memories(store, user_id, str(last_message.content))

        # Build messages with system prompt and memory context
        system_content = CHARITY_SEARCH_SYSTEM_PROMPT
        if memory_info:
            system_content += f"\n\nPrevious conversation context:\n{memory_info}"

        messages = [
            SystemMessage(content=system_content),
            HumanMessage(content=last_message.content)
        ]

        # Store user message. str() matches retrieve_memories' coercion —
        # message content may be a list of content blocks, not a plain str.
        await self.store_message(store, user_id, str(last_message.content), "user")

        # Run the ReAct agent with tools. FIX: propagate config so user_id /
        # thread_id / callbacks reach the inner agent run.
        result = await self.react_agent.ainvoke({"messages": messages}, config=config)

        # Extract the final response
        final_message = result["messages"][-1]
        response_content = final_message.content

        # Store assistant response
        await self.store_message(store, user_id, response_content, "assistant")

        return {"messages": [{"role": "assistant", "content": response_content}]}
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def create_charity_search_agent(llm):
    """Build and return a :class:`CharitySearchAgent` bound to *llm*."""
    return CharitySearchAgent(llm)
|
agents/classifier.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
from langgraph.store.base import BaseStore
|
| 4 |
+
from langchain_core.runnables import RunnableConfig
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class MessageClassifier(BaseModel):
    """Classification result for routing messages."""

    # Routing label consumed by the graph router; the allowed values must
    # stay in sync with the category list in the classifier prompt.
    message_type: Literal["emotional", "logical", "charity_search", "donor_search", "volunteer_search"] = Field(
        ...,
        description="Classify message for routing to appropriate agent."
    )
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
async def classify_message(state: dict, config: RunnableConfig, *, store: BaseStore, llm) -> dict:
    """Route the latest user message to the appropriate agent category.

    Args:
        state: Graph state containing messages
        config: Runtime config with user_id, thread_id
        store: Memory store (required by the graph signature but unused here)
        llm: Language model instance

    Returns:
        Dict with ``message_type`` used by the router edge
    """
    latest = state["messages"][-1]
    # Structured output forces the model's reply into MessageClassifier.
    structured = llm.with_structured_output(MessageClassifier)

    decision = structured.invoke([
        {
            "role": "system",
            "content": """Classify the user message into one of these categories:

Respond ONLY with valid JSON in this exact format:
{"message_type": "TYPE"}

Where TYPE is one of:
- 'emotional': Message requires emotional support, therapy, deals with feelings, or personal problems
- 'donor_search': Looking for donors in the database, finding people who donate, matching donors by criteria
- 'volunteer_search': Looking for volunteers in the database, finding people who volunteer, matching volunteers
- 'charity_search': Asking about charity organizations, nonprofits, wanting to research specific charities
- 'logical': Facts, information, logical analysis, practical solutions (default for general queries)

Examples:
- "Find donors interested in education in Singapore" → donor_search
- "Show me volunteers with tech skills" → volunteer_search
- "Tell me about Red Cross charity" → charity_search
- "I'm feeling sad today" → emotional
- "What is the capital of France?" → logical"""
        },
        {
            "role": "user",
            "content": latest.content
        }
    ])
    return {"message_type": decision.message_type}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def create_classifier(llm):
    """Return a LangGraph node function with *llm* already bound.

    Example:
        llm = ChatOllama(model="gpt-oss:120b-cloud")
        classify = create_classifier(llm)
        graph_builder.add_node("classifier", classify)
    """
    # Closure keeps the node signature graph-compatible (no llm parameter).
    async def classifier_node(state: dict, config: RunnableConfig, *, store: BaseStore):
        return await classify_message(state, config, store=store, llm=llm)

    return classifier_node
|
agents/logical.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import BaseMemoryAgent
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class LogicalAgent(BaseMemoryAgent):
    """Factual Q&A agent; memory handling is inherited from BaseMemoryAgent."""

    @property
    def system_prompt(self) -> str:
        # Persona prompt — the only thing this subclass contributes.
        return """You are a helpful assistant for a charity matching platform. Focus on providing clear, factual information about donors, charities, and philanthropy.

**Your role:**
- Provide accurate information about donor matching and charity recommendations
- Answer questions about causes, giving strategies, and impact
- Help users understand data and insights from the platform

**Response formatting guidelines:**
- Use **bold** for important terms or key points
- Use bullet points (- ) for listing features, options, or facts
- Use numbered lists (1. 2. 3.) for sequences or ranked items
- Keep paragraphs short (2-3 sentences max)
- Add blank lines between sections for readability
- Use headers with **Bold Text** when covering multiple topics

**Structure your responses:**
1. Start with a direct answer to the question
2. Provide supporting details or context
3. End with actionable next steps if applicable

**Keep responses focused and concise - aim for 3-5 short paragraphs maximum.**"""
|
agents/therapist.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import BaseMemoryAgent
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class TherapistAgent(BaseMemoryAgent):
    """Empathetic support agent; memory handling is inherited from BaseMemoryAgent."""

    @property
    def system_prompt(self) -> str:
        # Persona prompt — the only thing this subclass contributes.
        return """You are a compassionate therapist assistant for a charity matching platform. Focus on the emotional aspects of the user's message.

**Your approach:**
- Show empathy and validate their feelings
- Help them process their emotions about giving and philanthropy
- Ask thoughtful questions to explore their motivations for helping others

**Response formatting guidelines:**
- Use **bold** for emphasis on key points
- Use bullet points (- ) for lists of suggestions or ideas
- Use numbered lists (1. 2. 3.) for step-by-step guidance
- Keep paragraphs short and readable (2-3 sentences max)
- Add a blank line between sections for clarity
- End with an encouraging note or thoughtful question

**Keep responses concise but warm - aim for 3-5 short paragraphs maximum.**"""
|
app.py
ADDED
|
@@ -0,0 +1,1817 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI endpoints for Ollama chat and donor/volunteer recommendation system.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
- /chat: Chat with Ollama model using LangGraph with memory
|
| 6 |
+
- /donors/register: Register a donor and generate embedding
|
| 7 |
+
- /volunteers/register: Register a volunteer and generate embedding
|
| 8 |
+
- /donors/recommend: Find similar donors based on query
|
| 9 |
+
- /volunteers/recommend: Find similar volunteers based on query
|
| 10 |
+
- /forms/{id}: Get/Delete a stored form
|
| 11 |
+
- /forms/stats: Get form counts by type
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
import asyncio
|
| 17 |
+
from contextlib import asynccontextmanager
|
| 18 |
+
from typing import Optional, List, Dict, Any
|
| 19 |
+
|
| 20 |
+
# Add app directory to path for local module imports
|
| 21 |
+
APP_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 22 |
+
if APP_DIR not in sys.path:
|
| 23 |
+
sys.path.insert(0, APP_DIR)
|
| 24 |
+
|
| 25 |
+
from fastapi import FastAPI, HTTPException
|
| 26 |
+
from fastapi.responses import StreamingResponse
|
| 27 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 28 |
+
from pydantic import BaseModel, Field
|
| 29 |
+
|
| 30 |
+
# Windows-specific fix for psycopg async compatibility
|
| 31 |
+
if sys.platform == "win32":
|
| 32 |
+
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
| 33 |
+
|
| 34 |
+
# Load .env file for local development
|
| 35 |
+
try:
|
| 36 |
+
from dotenv import load_dotenv
|
| 37 |
+
load_dotenv()
|
| 38 |
+
except ImportError:
|
| 39 |
+
pass
|
| 40 |
+
|
| 41 |
+
# Lazy imports for encoder/recommender (avoid import errors if deps missing)
|
| 42 |
+
encoder = None
|
| 43 |
+
vector_store = None
|
| 44 |
+
pool = None
|
| 45 |
+
gis_recommender = None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ============================================================================
|
| 49 |
+
# Pydantic Models
|
| 50 |
+
# ============================================================================
|
| 51 |
+
|
| 52 |
+
class ChatResponse(BaseModel):
    """Payload returned by /chat when streaming is disabled."""
    # Full assistant reply as a single string.
    response: str
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class DonorFormRequest(BaseModel):
    """Donor registration form; fed into donor_form_to_text() for embedding."""
    id: str = Field(..., description="Unique identifier for the donor")
    name: str = Field(..., description="Donor name")
    donor_type: str = Field(..., description="Type: individual, corporate, foundation")
    country: str = Field(..., description="ASEAN country code (SG, MY, TH, VN, ID, PH, etc.)")
    preferred_language: str = Field(..., description="Primary language code")
    causes: List[str] = Field(default_factory=list, description="Interested causes")
    donation_frequency: Optional[str] = Field(None, description="one-time, monthly, quarterly, annual")
    amount_range: Optional[str] = Field(None, description="Preferred donation range")
    bio: Optional[str] = Field(None, description="Donor background")
    motivation: Optional[str] = Field(None, description="Why they want to donate")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class VolunteerFormRequest(BaseModel):
    """Volunteer registration form; fed into volunteer_form_to_text() for embedding."""
    id: str = Field(..., description="Unique identifier for the volunteer")
    name: str = Field(..., description="Volunteer name")
    volunteer_type: str = Field(..., description="Type: regular, event_based, skilled")
    country: str = Field(..., description="ASEAN country code")
    preferred_language: str = Field(..., description="Primary language code")
    languages_spoken: List[str] = Field(default_factory=list, description="All languages spoken")
    skills: List[str] = Field(default_factory=list, description="Professional/technical skills")
    availability: str = Field(..., description="weekends, evenings, flexible, full_time")
    causes: List[str] = Field(default_factory=list, description="Interested causes")
    experience: Optional[str] = Field(None, description="Prior volunteer experience")
    goals: Optional[str] = Field(None, description="What they hope to achieve")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class RecommendRequest(BaseModel):
    """Request for recommendations based on a query form.

    Callers either reference an existing stored form (form_id) to reuse its
    embedding, or supply inline profile fields to be encoded on the fly.
    """
    # Either provide a form_id to use existing embedding, or provide form data
    form_id: Optional[str] = Field(None, description="Existing form ID to use as query")
    # Or provide inline form data
    country: Optional[str] = None
    preferred_language: Optional[str] = None
    causes: List[str] = Field(default_factory=list)
    bio: Optional[str] = None
    motivation: Optional[str] = None
    # Search options
    limit: int = Field(default=10, ge=1, le=50)
    country_filter: Optional[str] = None
    exclude_ids: List[str] = Field(default_factory=list)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class FormResponse(BaseModel):
    """Response for form registration/retrieval operations."""
    id: str
    # "donor" or "volunteer" — inferred from endpoint usage; confirm in store.
    form_type: str
    message: str
    # Size of the generated embedding vector, when one was produced.
    embedding_dimension: Optional[int] = None
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class ClientProfileRequest(BaseModel):
    """Client profile with spatial and behavioral data (GIS recommender input)."""

    user_id: str
    # Default is central Singapore; ordering is [lat, lng] per the description.
    coordinates: List[float] = Field(
        default=[1.3521, 103.8198], description="[lat, lng]"
    )
    planning_area: str = Field(default="central", description="Singapore planning area")
    housing_type: str = Field(
        default="hdb_4_room", description="Housing type for income proxy"
    )
    interests: List[str] = Field(default_factory=list)
    causes: List[str] = Field(default_factory=list)
    preferred_language: str = Field(default="en")
    # Donation history summary used for behavioral scoring.
    is_donor: bool = False
    total_donated: float = 0.0
    donation_count: int = 0
    age_range: Optional[str] = None
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class LookalikeRequest(BaseModel):
    """Request for lookalike client search (seed causes/interests + geo filters)."""

    seed_causes: List[str] = Field(..., description="Causes to find lookalikes for")
    seed_interests: List[str] = Field(default_factory=list)
    planning_area_filter: Optional[str] = Field(
        None, description="Geo-fence by planning area"
    )
    housing_type_filter: Optional[List[str]] = Field(
        None, description="Filter by housing types"
    )
    limit: int = Field(default=50, ge=1, le=200)
    # Minimum combined score threshold; results below it are dropped.
    min_score: float = Field(default=0.0, ge=0.0, le=1.0)
    include_geojson: bool = Field(
        default=True, description="Include GeoJSON for mapping"
    )
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class ScoredClientResponse(BaseModel):
    """Single scored client result with the score components that built it."""

    user_id: str
    planning_area: str
    housing_type: str
    causes: List[str]
    interests: List[str]
    is_donor: bool
    # Blended score plus its individual components (weights live in the
    # GIS recommender — confirm there).
    final_score: float
    vector_similarity: float
    spatial_proxy: float
    proximity: float
    coordinates: Optional[List[float]] = None  # Reduced precision
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
class LookalikeResponse(BaseModel):
    """Response containing lookalike clients with optional GeoJSON."""

    seed_causes: List[str]
    total_found: int
    # Scored clients bucketed by tier label; tier names are produced by the
    # recommender (not defined here) — confirm against GISRecommender.
    tiers: Dict[str, List[ScoredClientResponse]]
    geojson: Optional[Dict[str, Any]] = None
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
class SingpassMockData(BaseModel):
    """Mock Singpass data for autofill (demo only; not real MyInfo data)."""

    name: str
    # NRIC is stored pre-masked; the full identifier never enters this model.
    nric_masked: str
    email: str
    mobile: str
    registered_address: str
    planning_area: str
    # Organization fields are present only for corporate profiles.
    organization_name: Optional[str] = None
    organization_uen: Optional[str] = None
    organization_type: Optional[str] = None
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
class RecommendationResult(BaseModel):
    """Single recommendation result from a vector-store search."""
    id: str
    form_type: str
    # Similarity score and raw vector distance for the match.
    score: float
    distance: float
    # Original stored form payload for display.
    form_data: Dict[str, Any]
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
class RecommendResponse(BaseModel):
    """Response containing recommendations for a query."""
    # ID of the query form when an existing form was used; None for inline queries.
    query_id: Optional[str]
    results: List[RecommendationResult]
    total_found: int
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
class StatsResponse(BaseModel):
    """Form statistics response: counts per form type plus their sum."""
    donor: int
    volunteer: int
    total: int
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ============================================================================
|
| 210 |
+
# Database & Encoder Setup
|
| 211 |
+
# ============================================================================
|
| 212 |
+
|
| 213 |
+
async def init_services():
    """Initialize the SeaLion encoder, Postgres pool, and GIS recommender.

    Runs once at startup via the lifespan hook. Failures are logged rather
    than raised so the API can still serve endpoints that do not need these
    services; the module globals simply stay None in that case.
    """
    global encoder, vector_store, pool, gis_recommender

    try:
        # Imported lazily so the app can boot even if these deps are missing.
        from encoders.sealion import SeaLionEncoder
        from recommender.vector_store import DonorVectorStore
        from recommender.gis_recommender import GISRecommender
        from psycopg_pool import AsyncConnectionPool

        # Initialize encoder (reads SEALION_ENDPOINT from env)
        encoder = SeaLionEncoder()

        # Build connection string from env vars
        db_host = os.getenv("SUPABASE_DB_HOST")
        db_port = os.getenv("SUPABASE_DB_PORT", "6543")
        db_name = os.getenv("SUPABASE_DB_NAME", "postgres")
        db_user = os.getenv("SUPABASE_DB_USER")
        db_password = os.getenv("SUPABASE_DB_PASSWORD")
        db_sslmode = os.getenv("SUPABASE_DB_SSLMODE", "require")

        if db_host and db_user and db_password:
            conn_string = (
                f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
                f"?sslmode={db_sslmode}"
            )
            # prepare_threshold=None disables prepared statements — presumably
            # for pgbouncer/Supabase transaction pooling (port 6543); confirm.
            pool = AsyncConnectionPool(
                conninfo=conn_string,
                max_size=10,
                kwargs={"autocommit": True, "prepare_threshold": None},
            )
            await pool.open()
            vector_store = DonorVectorStore(pool)
            gis_recommender = GISRecommender(vector_store=vector_store, encoder=encoder)
            print("[OK] Database connection pool initialized")
            print("[OK] GIS Recommender initialized")
        else:
            print("[WARN] Database credentials not configured, vector store disabled")

        print("[OK] SeaLion encoder initialized")

    except Exception as e:
        # Best-effort startup: log and continue with degraded functionality.
        print(f"[WARN] Service initialization error: {e}")
        print("    Some endpoints may not be available")
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
async def close_services():
    """Close the database connection pool opened by init_services(), if any."""
    global pool
    if pool:
        await pool.close()
        print("[OK] Database connection pool closed")
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager: bring services up before serving, tear down after.

    NOTE(review): init_agentic_rag() is defined in this module but is not
    awaited here — confirm it is invoked elsewhere, otherwise the /rag/search
    agent is never initialized.
    """
    await init_services()
    await init_langgraph()
    yield  # application serves requests between startup and shutdown
    await close_services()
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
# ============================================================================
|
| 277 |
+
# FastAPI App
|
| 278 |
+
# ============================================================================
|
| 279 |
+
|
| 280 |
+
# Application instance; the lifespan hook wires service startup/teardown.
app = FastAPI(
    title="Donor Recommendation API",
    description="API for chat, donor/volunteer registration, and recommendations",
    version="1.0.0",
    lifespan=lifespan,
)

# CORS middleware
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# rejected by browsers for credentialed requests — consider listing explicit
# origins if cookies/auth headers are expected cross-origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
# ============================================================================
|
| 298 |
+
# LangGraph Chat Setup
|
| 299 |
+
# ============================================================================
|
| 300 |
+
|
| 301 |
+
# Global graph instance (initialized at startup)
|
| 302 |
+
langgraph_chat = None
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
async def init_langgraph():
    """Build the LangGraph chat graph (with memory) at startup.

    On failure the global stays None and /chat responds with 503.
    """
    global langgraph_chat
    try:
        from graph.builder import build_graph_with_memory
        # build_graph_with_memory returns a 3-tuple; only the first element
        # (the compiled graph) is kept here.
        graph, _, _ = await build_graph_with_memory()
        langgraph_chat = graph
        print("[OK] LangGraph chat with memory initialized")
    except Exception as e:
        import traceback
        print(f"[WARN] LangGraph initialization error: {e}")
        traceback.print_exc()
        print("    /chat endpoint may not be available")
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
# ============================================================================
|
| 321 |
+
# Helper Functions
|
| 322 |
+
# ============================================================================
|
| 323 |
+
|
| 324 |
+
def donor_form_to_text(form: "DonorFormRequest") -> str:
    """Render a donor form as newline-separated "Label: value" lines.

    The three mandatory fields always appear; optional fields are included
    only when truthy, in the same order the form declares them.
    """
    lines = [
        f"Donor type: {form.donor_type}",
        f"Country: {form.country}",
        f"Preferred language: {form.preferred_language}",
    ]
    if form.causes:
        lines.append(f"Causes interested in: {', '.join(form.causes)}")
    # Optional free-text fields, emitted only when present.
    optional_fields = (
        ("Donation frequency", form.donation_frequency),
        ("Amount range", form.amount_range),
        ("Bio", form.bio),
        ("Motivation", form.motivation),
    )
    lines.extend(f"{label}: {value}" for label, value in optional_fields if value)
    return "\n".join(lines)
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def volunteer_form_to_text(form: "VolunteerFormRequest") -> str:
    """Render a volunteer form as newline-separated "Label: value" lines.

    Type, country, preferred language, and availability always appear;
    the remaining fields are included only when truthy, in declaration order.
    """
    lines = [
        f"Volunteer type: {form.volunteer_type}",
        f"Country: {form.country}",
        f"Preferred language: {form.preferred_language}",
    ]
    if form.languages_spoken:
        lines.append(f"Languages spoken: {', '.join(form.languages_spoken)}")
    if form.skills:
        lines.append(f"Skills: {', '.join(form.skills)}")
    # Availability is mandatory and always emitted.
    lines.append(f"Availability: {form.availability}")
    if form.causes:
        lines.append(f"Causes interested in: {', '.join(form.causes)}")
    for label, value in (("Experience", form.experience), ("Goals", form.goals)):
        if value:
            lines.append(f"{label}: {value}")
    return "\n".join(lines)
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def recommend_request_to_text(req: "RecommendRequest") -> str:
    """Render a recommendation query as text for encoding.

    Every field is optional; when nothing is provided the constant
    "General query" is returned so the encoder always gets some input.
    """
    lines = [
        f"{label}: {value}"
        for label, value in (
            ("Country", req.country),
            ("Preferred language", req.preferred_language),
        )
        if value
    ]
    if req.causes:
        lines.append(f"Causes interested in: {', '.join(req.causes)}")
    for label, value in (("Bio", req.bio), ("Motivation", req.motivation)):
        if value:
            lines.append(f"{label}: {value}")
    return "\n".join(lines) if lines else "General query"
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
# ============================================================================
|
| 382 |
+
# Health Endpoints
|
| 383 |
+
# ============================================================================
|
| 384 |
+
|
| 385 |
+
@app.get("/")
def root():
    """Root endpoint reporting overall status plus per-service availability."""
    # A service is "up" when its module-level global was populated at startup.
    service_status = {
        "langgraph_chat": langgraph_chat is not None,
        "encoder": encoder is not None,
        "database": vector_store is not None,
    }
    return {
        "status": "healthy",
        "message": "Donor Recommendation API is running",
        "services": service_status,
    }
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
@app.get("/health")
def health():
    """Minimal liveness probe."""
    payload = {"status": "healthy"}
    return payload
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
# ============================================================================
|
| 406 |
+
# Chat Endpoints
|
| 407 |
+
# ============================================================================
|
| 408 |
+
|
| 409 |
+
class ChatRequestWithMemory(BaseModel):
    """Body for /chat; user_id and thread_id scope the conversation memory."""
    message: str
    user_id: str = "default_user"
    thread_id: str = "default_thread"
    # When True the endpoint streams the reply as text/event-stream.
    stream: bool = False
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
@app.post("/chat")
async def chat(request: ChatRequestWithMemory):
    """Chat with the LangGraph-powered chatbot with memory.

    Returns a streaming text/event-stream response when request.stream is
    True, otherwise the final reply in a ChatResponse. Raises 503 if the
    graph failed to initialize at startup and 500 on runtime errors.
    """
    if not langgraph_chat:
        raise HTTPException(
            status_code=503,
            detail="LangGraph chat not initialized. Check server logs."
        )

    # thread_id/user_id are passed through LangGraph's configurable config —
    # presumably keys for the checkpointer memory; confirm in graph.builder.
    config = {
        "configurable": {
            "thread_id": request.thread_id,
            "user_id": request.user_id,
        }
    }

    try:
        if request.stream:
            async def generate_stream():
                # stream_mode="values" yields the full graph state each step;
                # emit the latest AI message content as it evolves.
                async for chunk in langgraph_chat.astream(
                    {"messages": [{"role": "user", "content": request.message}]},
                    config,
                    stream_mode="values",
                ):
                    if chunk.get("messages"):
                        last_msg = chunk["messages"][-1]
                        if hasattr(last_msg, 'content') and last_msg.type == 'ai':
                            yield last_msg.content

            return StreamingResponse(
                generate_stream(),
                media_type="text/event-stream"
            )
        else:
            # Non-streaming: collect full response
            response_content = ""
            async for chunk in langgraph_chat.astream(
                {"messages": [{"role": "user", "content": request.message}]},
                config,
                stream_mode="values",
            ):
                if chunk.get("messages"):
                    last_msg = chunk["messages"][-1]
                    if hasattr(last_msg, 'content') and last_msg.type == 'ai':
                        # Keep overwriting so the final AI message wins.
                        response_content = last_msg.content

            return ChatResponse(response=response_content)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
# ============================================================================
|
| 468 |
+
# Agentic RAG Endpoints
|
| 469 |
+
# ============================================================================
|
| 470 |
+
|
| 471 |
+
# Global agentic RAG agent instance
|
| 472 |
+
agentic_rag_agent = None
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
class AgenticRAGRequest(BaseModel):
    """Request for Agentic RAG search."""
    query: str = Field(..., description="Natural language query for donor/volunteer search")
    # Upper bound on agent tool-call iterations before it must answer.
    max_iterations: int = Field(default=10, ge=1, le=20, description="Max tool call iterations")
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
class AgenticRAGResponse(BaseModel):
    """Response from Agentic RAG search."""
    # Final natural-language answer produced by the agent.
    response: str
    # Record of each tool invocation the agent made while answering.
    tool_calls: List[Dict[str, Any]]
    # Total number of messages exchanged in the agent loop.
    message_count: int
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
async def init_agentic_rag():
    """Initialize the module-global Agentic RAG agent.

    Requires the global ``encoder`` and ``vector_store`` to be ready;
    otherwise logs a warning and leaves ``agentic_rag_agent`` as None.
    Initialization failures are logged and swallowed (never raised) so
    application startup can proceed without the agent.
    """
    global agentic_rag_agent

    if encoder is None or vector_store is None:
        print("[WARN] Cannot initialize Agentic RAG: encoder or vector_store not available")
        return

    try:
        from agents.agentic_rag import AgenticRAGAgent
        from langchain_ollama import ChatOllama

        # Create LLM for the agent: with an API key we call the hosted
        # Ollama endpoint with an auth header; without one we use the
        # cloud-proxy model name and default connection settings.
        api_key = os.getenv('OLLAMA_API_KEY')
        if api_key:
            llm = ChatOllama(
                model="gpt-oss:120b",
                base_url="https://ollama.com",
                client_kwargs={
                    "headers": {"Authorization": f"Bearer {api_key}"}
                }
            )
        else:
            llm = ChatOllama(model="gpt-oss:120b-cloud")

        agentic_rag_agent = AgenticRAGAgent(llm, encoder, vector_store)
        print("[OK] Agentic RAG agent initialized")

    except Exception as e:
        # Non-fatal: log the full traceback and continue without the agent.
        import traceback
        print(f"[WARN] Agentic RAG initialization error: {e}")
        traceback.print_exc()
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
@app.post("/rag/search", response_model=AgenticRAGResponse)
async def agentic_rag_search(request: AgenticRAGRequest):
    """
    Agentic RAG search - the agent autonomously explores the vector store.

    The agent will:
    1. Analyze your query to understand what you're looking for
    2. Explore available categories in the database
    3. Perform semantic and/or filtered searches
    4. Iteratively refine results if needed
    5. Return detailed findings with reasoning

    Example queries:
    - "Find donors interested in education in Singapore"
    - "Show me corporate donors who focus on environmental causes"
    - "Find volunteers with tech skills available on weekends"
    """
    global agentic_rag_agent

    # Lazily build the agent on first use, then bail out with 503 if it
    # still could not be constructed (missing encoder / database).
    if agentic_rag_agent is None:
        await init_agentic_rag()
        if agentic_rag_agent is None:
            raise HTTPException(
                status_code=503,
                detail="Agentic RAG not available. Ensure encoder and database are configured."
            )

    try:
        agent_result = await agentic_rag_agent.search(request.query)
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))

    return AgenticRAGResponse(
        response=agent_result["response"],
        tool_calls=agent_result["tool_calls"],
        message_count=agent_result["message_count"],
    )
|
| 563 |
+
|
| 564 |
+
|
| 565 |
+
@app.get("/rag/tools")
async def list_rag_tools():
    """List available RAG tools and their descriptions.

    Returns:
        dict with ``tools`` (name/description pairs) and ``total`` count.
    """
    from tools.rag_tools import RAG_TOOLS

    # Comprehension instead of the manual append loop (same output order).
    tools_info = [
        {"name": tool.name, "description": tool.description}
        for tool in RAG_TOOLS
    ]

    return {
        "tools": tools_info,
        "total": len(tools_info),
    }
|
| 581 |
+
|
| 582 |
+
|
| 583 |
+
@app.get("/rag/categories")
async def get_rag_categories():
    """Get available categories in the vector store for filtering."""
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    import json

    from tools.rag_tools import list_available_categories, set_rag_dependencies

    # Wire the RAG tool module to the live encoder/store before invoking it.
    if encoder and vector_store:
        set_rag_dependencies(encoder, vector_store)

    try:
        raw = await list_available_categories.ainvoke({})
        return json.loads(raw)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
# ============================================================================
|
| 604 |
+
# Donor Endpoints
|
| 605 |
+
# ============================================================================
|
| 606 |
+
|
| 607 |
+
@app.post("/donors/register", response_model=FormResponse)
async def register_donor(form: DonorFormRequest):
    """Register a donor and generate embedding."""
    # Both the encoder and the vector store must be up before we accept data.
    if not encoder:
        raise HTTPException(status_code=503, detail="Encoder not initialized")
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    try:
        # Flatten the form into text, embed it, then persist both together.
        encoding_text = donor_form_to_text(form)
        vector = await encoder.encode(encoding_text)

        await vector_store.store_embedding(
            form_id=form.id,
            form_type="donor",
            embedding=vector,
            form_data=form.model_dump(),
        )

        return FormResponse(
            id=form.id,
            form_type="donor",
            message="Donor registered successfully",
            embedding_dimension=len(vector),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 640 |
+
|
| 641 |
+
|
| 642 |
+
@app.post("/donors/recommend", response_model=RecommendResponse)
async def recommend_donors(request: RecommendRequest):
    """Find similar donors based on query.

    When ``request.form_id`` is supplied, the form's existence is verified
    (404 if missing); the query embedding is always freshly generated from
    the request text either way.

    Raises:
        HTTPException: 503 if encoder/store unavailable, 404 for an unknown
            form_id, 500 for any other failure.
    """
    if not encoder:
        raise HTTPException(status_code=503, detail="Encoder not initialized")
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    try:
        # Validate referenced form first so callers get a clear 404.
        if request.form_id:
            existing = await vector_store.get_embedding(request.form_id)
            if not existing:
                raise HTTPException(status_code=404, detail=f"Form {request.form_id} not found")

        # Both branches of the original code built the identical embedding;
        # the duplicated encode logic is hoisted out of the conditional.
        # (Could also reuse the stored raw embedding in the form_id case.)
        text = recommend_request_to_text(request)
        query_embedding = await encoder.encode(text)

        # Find similar donors
        results = await vector_store.find_similar(
            query_embedding=query_embedding,
            form_type="donor",
            limit=request.limit,
            country_filter=request.country_filter,
            exclude_ids=request.exclude_ids if request.exclude_ids else None
        )

        return RecommendResponse(
            query_id=request.form_id,
            results=[
                RecommendationResult(
                    id=r.id,
                    form_type=r.form_type,
                    score=r.score,
                    distance=r.distance,
                    form_data=r.form_data
                )
                for r in results
            ],
            total_found=len(results)
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
# ============================================================================
|
| 696 |
+
# Volunteer Endpoints
|
| 697 |
+
# ============================================================================
|
| 698 |
+
|
| 699 |
+
@app.post("/volunteers/register", response_model=FormResponse)
async def register_volunteer(form: VolunteerFormRequest):
    """Register a volunteer and generate embedding."""
    # Refuse early when either backend dependency is missing.
    if not encoder:
        raise HTTPException(status_code=503, detail="Encoder not initialized")
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    try:
        # Serialize the form to text, embed, and persist in one pass.
        encoding_text = volunteer_form_to_text(form)
        vector = await encoder.encode(encoding_text)

        await vector_store.store_embedding(
            form_id=form.id,
            form_type="volunteer",
            embedding=vector,
            form_data=form.model_dump(),
        )

        return FormResponse(
            id=form.id,
            form_type="volunteer",
            message="Volunteer registered successfully",
            embedding_dimension=len(vector),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 732 |
+
|
| 733 |
+
|
| 734 |
+
@app.post("/volunteers/recommend", response_model=RecommendResponse)
async def recommend_volunteers(request: RecommendRequest):
    """Find similar volunteers based on query."""
    if not encoder:
        raise HTTPException(status_code=503, detail="Encoder not initialized")
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    try:
        # Build the query embedding from the request text.
        query_vector = await encoder.encode(recommend_request_to_text(request))

        # Vector search restricted to volunteer records.
        matches = await vector_store.find_similar(
            query_embedding=query_vector,
            form_type="volunteer",
            limit=request.limit,
            country_filter=request.country_filter,
            exclude_ids=request.exclude_ids if request.exclude_ids else None,
        )

        hits = [
            RecommendationResult(
                id=m.id,
                form_type=m.form_type,
                score=m.score,
                distance=m.distance,
                form_data=m.form_data,
            )
            for m in matches
        ]

        return RecommendResponse(
            query_id=request.form_id,
            results=hits,
            total_found=len(hits),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 773 |
+
|
| 774 |
+
|
| 775 |
+
# ============================================================================
|
| 776 |
+
# Form Management Endpoints
|
| 777 |
+
# ============================================================================
|
| 778 |
+
|
| 779 |
+
@app.get("/forms/{form_id}")
async def get_form(form_id: str):
    """Get a stored form by ID."""
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    record = await vector_store.get_embedding(form_id)
    if not record:
        raise HTTPException(status_code=404, detail=f"Form {form_id} not found")

    # Only the metadata is returned; the raw embedding stays server-side.
    return {"id": record.id, "form_type": record.form_type, "form_data": record.form_data}
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
@app.delete("/forms/{form_id}")
async def delete_form(form_id: str):
    """Delete a form by ID."""
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    # The store reports whether anything was actually removed.
    was_deleted = await vector_store.delete_embedding(form_id)
    if not was_deleted:
        raise HTTPException(status_code=404, detail=f"Form {form_id} not found")

    return {"message": f"Form {form_id} deleted successfully"}
|
| 807 |
+
|
| 808 |
+
|
| 809 |
+
@app.get("/forms/stats/summary", response_model=StatsResponse)
async def get_form_stats():
    """Get form counts by type."""
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    try:
        # The store returns a mapping; missing keys default to zero.
        type_counts = await vector_store.count_by_type()
        return StatsResponse(
            donor=type_counts.get("donor", 0),
            volunteer=type_counts.get("volunteer", 0),
            total=type_counts.get("total", 0),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 824 |
+
|
| 825 |
+
|
| 826 |
+
# ============================================================================
|
| 827 |
+
# Cause-based Search Endpoint
|
| 828 |
+
# ============================================================================
|
| 829 |
+
|
| 830 |
+
@app.post("/forms/search/causes")
async def search_by_causes(
    causes: List[str],
    limit: int = 20
):
    """Search forms by causes with embedding ranking."""
    if not encoder:
        raise HTTPException(status_code=503, detail="Encoder not initialized")
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    try:
        # Build a synthetic query embedding so matches can be ranked.
        ranking_vector = await encoder.encode(
            f"Causes interested in: {', '.join(causes)}"
        )

        matches = await vector_store.find_by_causes(
            target_causes=causes,
            query_embedding=ranking_vector,
            limit=limit,
        )

        payload = [
            {
                "id": m.id,
                "form_type": m.form_type,
                "score": m.score,
                "distance": m.distance,
                "form_data": m.form_data,
            }
            for m in matches
        ]

        return {"causes": causes, "results": payload, "total_found": len(payload)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 869 |
+
|
| 870 |
+
|
| 871 |
+
# ============================================================================
|
| 872 |
+
# GIS & Client Targeting Endpoints
|
| 873 |
+
# ============================================================================
|
| 874 |
+
|
| 875 |
+
# Mock Singpass data for different organization profiles
|
| 876 |
+
# Static demo fixtures keyed by profile id; served by the /singpass/mock
# endpoints for autofill demonstrations. All values are fictitious.
MOCK_SINGPASS_PROFILES = {
    # Charity profile (Orchard area)
    "org_001": SingpassMockData(
        name="Sarah Tan Wei Ling",
        nric_masked="S****567A",
        email="sarah.tan@example.org",
        mobile="+65 9123 4567",
        registered_address="123 Orchard Road, #12-01, Singapore 238867",
        planning_area="orchard",
        organization_name="Hearts of Hope Foundation",
        organization_uen="201912345K",
        organization_type="charity",
    ),
    # NGO profile (Jurong East area)
    "org_002": SingpassMockData(
        name="Ahmad bin Ibrahim",
        nric_masked="S****234B",
        email="ahmad.ibrahim@greensg.org",
        mobile="+65 9876 5432",
        registered_address="45 Jurong East Ave 1, #05-12, Singapore 609788",
        planning_area="jurong_east",
        organization_name="Green Singapore Initiative",
        organization_uen="201823456M",
        organization_type="ngo",
    ),
    # Social-enterprise profile (Toa Payoh area)
    "org_003": SingpassMockData(
        name="Lee Mei Hua",
        nric_masked="S****789C",
        email="meihua@eldercare.sg",
        mobile="+65 8765 4321",
        registered_address="78 Toa Payoh Lorong 1, #08-22, Singapore 310078",
        planning_area="toa_payoh",
        organization_name="ElderCare Singapore",
        organization_uen="200934567N",
        organization_type="social_enterprise",
    ),
}
|
| 911 |
+
|
| 912 |
+
|
| 913 |
+
@app.get("/singpass/mock/{profile_id}", response_model=SingpassMockData)
async def get_singpass_mock_data(profile_id: str):
    """
    Get mock Singpass data for autofill demonstration.

    Available profiles: org_001, org_002, org_003
    """
    if profile_id not in MOCK_SINGPASS_PROFILES:
        # Fall back to the default profile (org_001) for unknown ids.
        # NOTE(review): the original comment claimed a *random* profile is
        # returned, but the fallback is always org_001.
        profile_id = "org_001"

    return MOCK_SINGPASS_PROFILES[profile_id]
|
| 925 |
+
|
| 926 |
+
|
| 927 |
+
@app.get("/singpass/mock", response_model=Dict[str, SingpassMockData])
async def list_singpass_mock_profiles():
    """List all available mock Singpass profiles."""
    # Returns the module-level fixture dict as-is.
    return MOCK_SINGPASS_PROFILES
|
| 931 |
+
|
| 932 |
+
|
| 933 |
+
@app.get("/planning-areas")
async def get_planning_areas():
    """Get all Singapore planning areas with coordinates."""
    # Imported lazily so the GIS module is only loaded when this is hit.
    from recommender.gis_recommender import PLANNING_AREAS

    return PLANNING_AREAS
|
| 939 |
+
|
| 940 |
+
|
| 941 |
+
@app.get("/housing-types")
async def get_housing_types():
    """Get all housing types with income proxy scores."""
    from recommender.gis_recommender import HOUSING_INCOME_PROXY, HousingType

    # Enumerate the type labels and map each to its income-proxy score.
    type_labels = [housing.value for housing in HousingType]
    proxy_scores = {
        housing.value: proxy for housing, proxy in HOUSING_INCOME_PROXY.items()
    }

    return {"types": type_labels, "income_proxy": proxy_scores}
|
| 950 |
+
|
| 951 |
+
|
| 952 |
+
@app.post("/clients/register", response_model=FormResponse)
async def register_client(profile: ClientProfileRequest):
    """
    Register a client profile with spatial and behavioral data.

    This creates an embedding combining interests/causes with spatial context.
    """
    if not encoder:
        raise HTTPException(status_code=503, detail="Encoder not initialized")
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    try:
        from recommender.gis_recommender import ClientProfile, HousingType

        # Map the API request onto the internal GIS profile object.
        client_profile = ClientProfile(
            user_id=profile.user_id,
            coordinates=tuple(profile.coordinates),
            planning_area=profile.planning_area,
            housing_type=HousingType(profile.housing_type),
            interests=profile.interests,
            causes=profile.causes,
            preferred_language=profile.preferred_language,
            is_donor=profile.is_donor,
            total_donated=profile.total_donated,
            donation_count=profile.donation_count,
            age_range=profile.age_range,
        )

        # Encode the textual rendering of the profile.
        vector = await encoder.encode(client_profile.to_embedding_text())

        # Persist alongside the serialized profile; a country code is added
        # so the existing country filter keeps working for client records.
        stored_data = client_profile.to_dict()
        stored_data["country"] = "SG"  # For existing filter compatibility

        await vector_store.store_embedding(
            form_id=profile.user_id,
            form_type="client",
            embedding=vector,
            form_data=stored_data,
        )

        return FormResponse(
            id=profile.user_id,
            form_type="client",
            message="Client profile registered successfully",
            embedding_dimension=len(vector),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 1006 |
+
|
| 1007 |
+
|
| 1008 |
+
@app.post("/clients/lookalike", response_model=LookalikeResponse)
async def find_lookalike_clients(request: LookalikeRequest):
    """
    Find lookalike clients (potential donors) based on a seed profile.

    This uses the GIS recommender with hybrid semantic-spatial matching:
    1. Find registered donors from database via vector search
    2. Apply spatial/housing filters
    3. Score using tiered targeting (vector + spatial proxy + proximity)
    4. Fall back to mock data if database has insufficient results
    5. Return results with optional GeoJSON for mapping

    Note: Searches BOTH donors (from /donors/register) and clients
    (from /clients/register) to find potential matches.
    """
    try:
        from recommender.gis_recommender import (
            HousingType,
            GISRecommender,
            generate_seed_donor_profile,
            generate_mock_clients,
        )

        # Create seed profile from request
        seed = generate_seed_donor_profile(
            cause=request.seed_causes[0] if request.seed_causes else "education"
        )
        seed.causes = request.seed_causes
        seed.interests = request.seed_interests

        # BUG FIX: previously this was only constructed inside the
        # planning-area branch below, but it is used unconditionally later
        # (mock fallback, tiered targeting, GeoJSON) -> NameError whenever
        # no planning_area_filter was supplied.
        local_recommender = GISRecommender()

        # Update seed coordinates if planning area specified
        if request.planning_area_filter:
            from recommender.gis_recommender import PLANNING_AREAS

            if request.planning_area_filter in PLANNING_AREAS:
                area = PLANNING_AREAS[request.planning_area_filter]
                seed.coordinates = (area["lat"], area["lng"])
                seed.planning_area = request.planning_area_filter

                # Regenerate embeddings for the relocated seed
                seed.embedding = None  # Force regeneration
                seed.embedding = local_recommender._generate_fallback_embedding(seed)
                seed.compute_reduced_embeddings()

        # Convert housing type filter
        housing_filter = None
        if request.housing_type_filter:
            housing_filter = [HousingType(h) for h in request.housing_type_filter]

        scored_clients = []
        db_results_count = 0

        # Try database search first if available
        if gis_recommender and encoder and vector_store:
            try:
                print(
                    f"Searching database for donors matching causes: {request.seed_causes}"
                )
                scored_clients = await gis_recommender.find_lookalikes(
                    seed_profile=seed,
                    k=request.limit * 2,  # Get more to allow for filtering
                    planning_area_filter=None,  # Remove strict filter for DB search
                    housing_type_filter=None,  # Filter after retrieval
                    use_hybrid=False,
                )
                db_results_count = len(scored_clients)
                print(f"Found {db_results_count} donors/clients from database")

                # Apply filters after retrieval for more flexible matching
                if request.planning_area_filter:
                    scored_clients = [
                        sc
                        for sc in scored_clients
                        if sc.client.planning_area == request.planning_area_filter
                    ]

                if housing_filter:
                    scored_clients = [
                        sc
                        for sc in scored_clients
                        if sc.client.housing_type in housing_filter
                    ]

            except Exception as e:
                print(f"Database search failed: {e}")
                import traceback

                traceback.print_exc()

        # If insufficient results from database, supplement with mock data
        min_results = max(request.limit // 2, 10)  # At least half the requested or 10
        if len(scored_clients) < min_results:
            print(f"Only {len(scored_clients)} from DB, supplementing with mock data")

            # Generate mock candidates
            fallback_candidates = generate_mock_clients(150)

            # Filter by causes for relevance
            if request.seed_causes:
                cause_matched = [
                    c
                    for c in fallback_candidates
                    if any(cause in c.causes for cause in request.seed_causes)
                ]
                if len(cause_matched) >= 20:
                    fallback_candidates = cause_matched

            # Ensure the seed has an embedding before hybrid matching
            # (it is only regenerated above when a planning area is given).
            if getattr(seed, "embedding", None) is None:
                seed.embedding = local_recommender._generate_fallback_embedding(seed)
                seed.compute_reduced_embeddings()

            # Use hybrid matching on mock data
            mock_results = local_recommender.find_lookalikes_hybrid(
                seed_profile=seed,
                candidates=fallback_candidates,
                k=request.limit - len(scored_clients),
                planning_area_filter=request.planning_area_filter,
                housing_type_filter=housing_filter,
            )

            scored_clients.extend(mock_results)
            print(
                f"Added {len(mock_results)} mock results, total: {len(scored_clients)}"
            )

        # Sort combined results by score
        scored_clients.sort(key=lambda x: x.final_score, reverse=True)
        scored_clients = scored_clients[: request.limit]

        # Apply tiered targeting with relaxed min_score for small datasets
        effective_min_score = max(0, request.min_score - 0.1)  # Relax slightly
        tiered = local_recommender.apply_tiered_targeting(
            scored_clients, min_score=effective_min_score
        )

        # Convert one scored client to the API response shape.
        def to_response(sc):
            return ScoredClientResponse(
                user_id=sc.client.user_id,
                planning_area=sc.client.planning_area,
                housing_type=sc.client.housing_type.value,
                causes=sc.client.causes,
                interests=sc.client.interests,
                is_donor=sc.client.is_donor,
                final_score=round(sc.final_score, 3),
                vector_similarity=round(sc.vector_similarity_score, 3),
                spatial_proxy=round(sc.spatial_proxy_score, 3),
                proximity=round(sc.proximity_score, 3),
                coordinates=(
                    list(sc.client.coordinates) if request.include_geojson else None
                ),
            )

        tiers_response = {
            "tier_1": [to_response(sc) for sc in tiered["tier_1"]],
            "tier_2": [to_response(sc) for sc in tiered["tier_2"]],
            "tier_3": [to_response(sc) for sc in tiered["tier_3"]],
        }

        # Generate GeoJSON if requested
        geojson = None
        if request.include_geojson:
            all_clients = tiered["tier_1"] + tiered["tier_2"] + tiered["tier_3"]
            geojson = local_recommender.to_geojson(all_clients)

        total = sum(len(t) for t in tiered.values())

        return LookalikeResponse(
            seed_causes=request.seed_causes,
            total_found=total,
            tiers=tiers_response,
            geojson=geojson,
        )

    except Exception as e:
        import traceback

        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
|
| 1185 |
+
|
| 1186 |
+
|
| 1187 |
+
async def _get_mock_lookalike_response(request: LookalikeRequest) -> LookalikeResponse:
|
| 1188 |
+
"""Generate mock lookalike response when GIS recommender unavailable."""
|
| 1189 |
+
from recommender.gis_recommender import (
|
| 1190 |
+
generate_mock_clients,
|
| 1191 |
+
PLANNING_AREAS,
|
| 1192 |
+
HOUSING_INCOME_PROXY,
|
| 1193 |
+
HousingType,
|
| 1194 |
+
)
|
| 1195 |
+
|
| 1196 |
+
# Generate mock clients
|
| 1197 |
+
mock_clients = generate_mock_clients(100)
|
| 1198 |
+
|
| 1199 |
+
# Filter by causes
|
| 1200 |
+
filtered = [
|
| 1201 |
+
c
|
| 1202 |
+
for c in mock_clients
|
| 1203 |
+
if any(cause in c.causes for cause in request.seed_causes)
|
| 1204 |
+
]
|
| 1205 |
+
|
| 1206 |
+
# Apply planning area filter
|
| 1207 |
+
if request.planning_area_filter:
|
| 1208 |
+
filtered = [
|
| 1209 |
+
c for c in filtered if c.planning_area == request.planning_area_filter
|
| 1210 |
+
]
|
| 1211 |
+
|
| 1212 |
+
# Score and sort
|
| 1213 |
+
scored = []
|
| 1214 |
+
for client in filtered[: request.limit]:
|
| 1215 |
+
# Calculate mock scores
|
| 1216 |
+
cause_match = len(set(client.causes) & set(request.seed_causes)) / max(
|
| 1217 |
+
len(request.seed_causes), 1
|
| 1218 |
+
)
|
| 1219 |
+
spatial_score = HOUSING_INCOME_PROXY.get(client.housing_type, 0.5)
|
| 1220 |
+
final_score = 0.5 * cause_match + 0.3 * spatial_score + 0.2 * 0.5
|
| 1221 |
+
|
| 1222 |
+
scored.append(
|
| 1223 |
+
{
|
| 1224 |
+
"client": client,
|
| 1225 |
+
"final_score": final_score,
|
| 1226 |
+
"vector_similarity": cause_match,
|
| 1227 |
+
"spatial_proxy": spatial_score,
|
| 1228 |
+
"proximity": 0.5,
|
| 1229 |
+
}
|
| 1230 |
+
)
|
| 1231 |
+
|
| 1232 |
+
scored.sort(key=lambda x: x["final_score"], reverse=True)
|
| 1233 |
+
|
| 1234 |
+
# Apply min score filter
|
| 1235 |
+
scored = [s for s in scored if s["final_score"] >= request.min_score]
|
| 1236 |
+
|
| 1237 |
+
# Create tiers
|
| 1238 |
+
n = len(scored)
|
| 1239 |
+
tier_size = max(n // 3, 1)
|
| 1240 |
+
|
| 1241 |
+
def to_response(s):
|
| 1242 |
+
c = s["client"]
|
| 1243 |
+
return ScoredClientResponse(
|
| 1244 |
+
user_id=c.user_id,
|
| 1245 |
+
planning_area=c.planning_area,
|
| 1246 |
+
housing_type=c.housing_type.value,
|
| 1247 |
+
causes=c.causes,
|
| 1248 |
+
interests=c.interests,
|
| 1249 |
+
is_donor=c.is_donor,
|
| 1250 |
+
final_score=round(s["final_score"], 3),
|
| 1251 |
+
vector_similarity=round(s["vector_similarity"], 3),
|
| 1252 |
+
spatial_proxy=round(s["spatial_proxy"], 3),
|
| 1253 |
+
proximity=round(s["proximity"], 3),
|
| 1254 |
+
coordinates=list(c.coordinates) if request.include_geojson else None,
|
| 1255 |
+
)
|
| 1256 |
+
|
| 1257 |
+
tiers = {
|
| 1258 |
+
"tier_1": [to_response(s) for s in scored[:tier_size]],
|
| 1259 |
+
"tier_2": [to_response(s) for s in scored[tier_size : tier_size * 2]],
|
| 1260 |
+
"tier_3": [to_response(s) for s in scored[tier_size * 2 :]],
|
| 1261 |
+
}
|
| 1262 |
+
|
| 1263 |
+
# Generate GeoJSON
|
| 1264 |
+
geojson = None
|
| 1265 |
+
if request.include_geojson:
|
| 1266 |
+
features = []
|
| 1267 |
+
for s in scored:
|
| 1268 |
+
c = s["client"]
|
| 1269 |
+
features.append(
|
| 1270 |
+
{
|
| 1271 |
+
"type": "Feature",
|
| 1272 |
+
"geometry": {
|
| 1273 |
+
"type": "Point",
|
| 1274 |
+
"coordinates": [
|
| 1275 |
+
round(c.coordinates[1], 3),
|
| 1276 |
+
round(c.coordinates[0], 3),
|
| 1277 |
+
],
|
| 1278 |
+
},
|
| 1279 |
+
"properties": {
|
| 1280 |
+
"user_id": c.user_id,
|
| 1281 |
+
"planning_area": c.planning_area,
|
| 1282 |
+
"housing_type": c.housing_type.value,
|
| 1283 |
+
"causes": c.causes,
|
| 1284 |
+
"is_donor": c.is_donor,
|
| 1285 |
+
"final_score": round(s["final_score"], 3),
|
| 1286 |
+
},
|
| 1287 |
+
}
|
| 1288 |
+
)
|
| 1289 |
+
geojson = {"type": "FeatureCollection", "features": features}
|
| 1290 |
+
|
| 1291 |
+
return LookalikeResponse(
|
| 1292 |
+
seed_causes=request.seed_causes,
|
| 1293 |
+
total_found=len(scored),
|
| 1294 |
+
tiers=tiers,
|
| 1295 |
+
geojson=geojson,
|
| 1296 |
+
)
|
| 1297 |
+
|
| 1298 |
+
|
| 1299 |
+
@app.post("/clients/seed-mock-data")
async def seed_mock_client_data(count: int = 100):
    """
    Seed the database with mock client profiles for testing.

    This populates the vector store with realistic Singapore client data.
    """
    if not encoder:
        raise HTTPException(status_code=503, detail="Encoder not initialized")
    if not vector_store:
        raise HTTPException(status_code=503, detail="Database not connected")

    try:
        from recommender.gis_recommender import generate_mock_clients

        stored = 0
        # Embed and persist each generated profile one at a time.
        for mock_profile in generate_mock_clients(count):
            vector = await encoder.encode(mock_profile.to_embedding_text())

            payload = mock_profile.to_dict()
            payload["country"] = "SG"

            await vector_store.store_embedding(
                form_id=mock_profile.user_id,
                form_type="client",
                embedding=vector,
                form_data=payload,
            )
            stored += 1

        return {
            "message": f"Seeded {stored} mock client profiles",
            "count": stored,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 1339 |
+
|
| 1340 |
+
|
| 1341 |
+
@app.get("/debug/database-stats")
|
| 1342 |
+
async def get_database_stats():
|
| 1343 |
+
"""
|
| 1344 |
+
Debug endpoint to check what's stored in the vector database.
|
| 1345 |
+
|
| 1346 |
+
Returns counts of donors, volunteers, and clients in the database.
|
| 1347 |
+
"""
|
| 1348 |
+
if not vector_store:
|
| 1349 |
+
return {"error": "Database not connected", "stats": None}
|
| 1350 |
+
|
| 1351 |
+
try:
|
| 1352 |
+
async with vector_store.pool.connection() as conn:
|
| 1353 |
+
async with conn.cursor() as cur:
|
| 1354 |
+
# Count by form_type
|
| 1355 |
+
await cur.execute(
|
| 1356 |
+
"""
|
| 1357 |
+
SELECT
|
| 1358 |
+
metadata->>'form_type' as form_type,
|
| 1359 |
+
COUNT(*) as count
|
| 1360 |
+
FROM my_embeddings
|
| 1361 |
+
GROUP BY metadata->>'form_type'
|
| 1362 |
+
ORDER BY count DESC
|
| 1363 |
+
"""
|
| 1364 |
+
)
|
| 1365 |
+
type_counts = await cur.fetchall()
|
| 1366 |
+
|
| 1367 |
+
# Get sample entries
|
| 1368 |
+
await cur.execute(
|
| 1369 |
+
"""
|
| 1370 |
+
SELECT source_id, metadata->>'form_type',
|
| 1371 |
+
LEFT(text_content::text, 200) as preview
|
| 1372 |
+
FROM my_embeddings
|
| 1373 |
+
ORDER BY id DESC
|
| 1374 |
+
LIMIT 10
|
| 1375 |
+
"""
|
| 1376 |
+
)
|
| 1377 |
+
recent = await cur.fetchall()
|
| 1378 |
+
|
| 1379 |
+
return {
|
| 1380 |
+
"connected": True,
|
| 1381 |
+
"form_type_counts": {row[0]: row[1] for row in type_counts},
|
| 1382 |
+
"total_entries": sum(row[1] for row in type_counts),
|
| 1383 |
+
"recent_entries": [
|
| 1384 |
+
{"id": row[0], "form_type": row[1], "preview": row[2]} for row in recent
|
| 1385 |
+
],
|
| 1386 |
+
}
|
| 1387 |
+
except Exception as e:
|
| 1388 |
+
return {"error": str(e), "stats": None}
|
| 1389 |
+
|
| 1390 |
+
|
| 1391 |
+
@app.get("/clients/map-demographics")
|
| 1392 |
+
async def get_map_demographics(
|
| 1393 |
+
causes: Optional[str] = None, # Comma-separated causes
|
| 1394 |
+
include_donors: bool = True,
|
| 1395 |
+
include_clients: bool = True,
|
| 1396 |
+
):
|
| 1397 |
+
"""
|
| 1398 |
+
Get aggregated demographics data for Singapore map visualization.
|
| 1399 |
+
|
| 1400 |
+
Returns:
|
| 1401 |
+
- Planning area aggregates (donor counts, cause distribution, housing breakdown)
|
| 1402 |
+
- Individual donor/client points with coordinates
|
| 1403 |
+
- Demographics summary for clusters
|
| 1404 |
+
"""
|
| 1405 |
+
from recommender.gis_recommender import (
|
| 1406 |
+
PLANNING_AREAS,
|
| 1407 |
+
HousingType,
|
| 1408 |
+
HOUSING_INCOME_PROXY,
|
| 1409 |
+
)
|
| 1410 |
+
|
| 1411 |
+
if not vector_store:
|
| 1412 |
+
# Return mock data if database not available
|
| 1413 |
+
return await _generate_mock_map_demographics(causes)
|
| 1414 |
+
|
| 1415 |
+
try:
|
| 1416 |
+
cause_list = causes.split(",") if causes else None
|
| 1417 |
+
|
| 1418 |
+
# Query all donors and clients from database
|
| 1419 |
+
all_entries = []
|
| 1420 |
+
|
| 1421 |
+
if include_donors:
|
| 1422 |
+
donor_results = await vector_store.find_by_form_type("donor", limit=500)
|
| 1423 |
+
all_entries.extend(donor_results)
|
| 1424 |
+
|
| 1425 |
+
if include_clients:
|
| 1426 |
+
client_results = await vector_store.find_by_form_type("client", limit=500)
|
| 1427 |
+
all_entries.extend(client_results)
|
| 1428 |
+
|
| 1429 |
+
# Aggregate by planning area
|
| 1430 |
+
area_stats = {}
|
| 1431 |
+
individual_points = []
|
| 1432 |
+
|
| 1433 |
+
for entry in all_entries:
|
| 1434 |
+
form_data = (
|
| 1435 |
+
entry.form_data
|
| 1436 |
+
if hasattr(entry, "form_data")
|
| 1437 |
+
else entry.get("form_data", {})
|
| 1438 |
+
)
|
| 1439 |
+
entry_id = entry.id if hasattr(entry, "id") else entry.get("id", "")
|
| 1440 |
+
form_type = (
|
| 1441 |
+
entry.form_type
|
| 1442 |
+
if hasattr(entry, "form_type")
|
| 1443 |
+
else entry.get("form_type", "")
|
| 1444 |
+
)
|
| 1445 |
+
|
| 1446 |
+
# Get planning area
|
| 1447 |
+
planning_area = form_data.get("planning_area", "unknown")
|
| 1448 |
+
if planning_area == "unknown" and form_data.get("country") == "SG":
|
| 1449 |
+
# Infer planning area from ID hash for donors without explicit area
|
| 1450 |
+
import hashlib
|
| 1451 |
+
|
| 1452 |
+
area_list = list(PLANNING_AREAS.keys())
|
| 1453 |
+
idx = int(hashlib.md5(entry_id.encode()).hexdigest(), 16) % len(
|
| 1454 |
+
area_list
|
| 1455 |
+
)
|
| 1456 |
+
planning_area = area_list[idx]
|
| 1457 |
+
|
| 1458 |
+
# Get causes
|
| 1459 |
+
entry_causes = form_data.get("causes", [])
|
| 1460 |
+
if isinstance(entry_causes, str):
|
| 1461 |
+
entry_causes = [entry_causes]
|
| 1462 |
+
|
| 1463 |
+
# Filter by causes if specified
|
| 1464 |
+
if cause_list:
|
| 1465 |
+
if not any(c in entry_causes for c in cause_list):
|
| 1466 |
+
continue
|
| 1467 |
+
|
| 1468 |
+
# Get housing type
|
| 1469 |
+
housing_type = form_data.get("housing_type", "hdb_4_room")
|
| 1470 |
+
amount_range = form_data.get("amount_range", "")
|
| 1471 |
+
if not housing_type or housing_type == "unknown":
|
| 1472 |
+
# Infer from amount_range
|
| 1473 |
+
if "10000" in str(amount_range) or "5000" in str(amount_range):
|
| 1474 |
+
housing_type = "landed"
|
| 1475 |
+
elif "1000" in str(amount_range):
|
| 1476 |
+
housing_type = "condo"
|
| 1477 |
+
elif "500" in str(amount_range):
|
| 1478 |
+
housing_type = "hdb_executive"
|
| 1479 |
+
else:
|
| 1480 |
+
housing_type = "hdb_4_room"
|
| 1481 |
+
|
| 1482 |
+
# Get coordinates
|
| 1483 |
+
if planning_area in PLANNING_AREAS:
|
| 1484 |
+
area_info = PLANNING_AREAS[planning_area]
|
| 1485 |
+
lat = area_info["lat"] + (hash(entry_id) % 100 - 50) * 0.0005
|
| 1486 |
+
lng = area_info["lng"] + (hash(entry_id[::-1]) % 100 - 50) * 0.0005
|
| 1487 |
+
else:
|
| 1488 |
+
lat, lng = 1.3521, 103.8198 # Singapore center
|
| 1489 |
+
|
| 1490 |
+
# Aggregate by area
|
| 1491 |
+
if planning_area not in area_stats:
|
| 1492 |
+
area_stats[planning_area] = {
|
| 1493 |
+
"name": PLANNING_AREAS.get(planning_area, {}).get(
|
| 1494 |
+
"name", planning_area.replace("_", " ").title()
|
| 1495 |
+
),
|
| 1496 |
+
"lat": PLANNING_AREAS.get(planning_area, {}).get("lat", 1.3521),
|
| 1497 |
+
"lng": PLANNING_AREAS.get(planning_area, {}).get("lng", 103.8198),
|
| 1498 |
+
"total_count": 0,
|
| 1499 |
+
"donor_count": 0,
|
| 1500 |
+
"client_count": 0,
|
| 1501 |
+
"causes": {},
|
| 1502 |
+
"housing_breakdown": {},
|
| 1503 |
+
"avg_income_proxy": 0,
|
| 1504 |
+
"income_proxies": [],
|
| 1505 |
+
}
|
| 1506 |
+
|
| 1507 |
+
stats = area_stats[planning_area]
|
| 1508 |
+
stats["total_count"] += 1
|
| 1509 |
+
if form_type == "donor":
|
| 1510 |
+
stats["donor_count"] += 1
|
| 1511 |
+
else:
|
| 1512 |
+
stats["client_count"] += 1
|
| 1513 |
+
|
| 1514 |
+
# Count causes
|
| 1515 |
+
for cause in entry_causes:
|
| 1516 |
+
stats["causes"][cause] = stats["causes"].get(cause, 0) + 1
|
| 1517 |
+
|
| 1518 |
+
# Count housing
|
| 1519 |
+
stats["housing_breakdown"][housing_type] = (
|
| 1520 |
+
stats["housing_breakdown"].get(housing_type, 0) + 1
|
| 1521 |
+
)
|
| 1522 |
+
|
| 1523 |
+
# Track income proxy
|
| 1524 |
+
try:
|
| 1525 |
+
income_proxy = HOUSING_INCOME_PROXY.get(HousingType(housing_type), 0.5)
|
| 1526 |
+
except:
|
| 1527 |
+
income_proxy = 0.5
|
| 1528 |
+
stats["income_proxies"].append(income_proxy)
|
| 1529 |
+
|
| 1530 |
+
# Add individual point
|
| 1531 |
+
individual_points.append(
|
| 1532 |
+
{
|
| 1533 |
+
"id": entry_id,
|
| 1534 |
+
"type": form_type,
|
| 1535 |
+
"lat": lat,
|
| 1536 |
+
"lng": lng,
|
| 1537 |
+
"planning_area": planning_area,
|
| 1538 |
+
"housing_type": housing_type,
|
| 1539 |
+
"causes": entry_causes[:5], # Limit for performance
|
| 1540 |
+
"is_donor": form_type == "donor",
|
| 1541 |
+
}
|
| 1542 |
+
)
|
| 1543 |
+
|
| 1544 |
+
# Calculate averages
|
| 1545 |
+
for area, stats in area_stats.items():
|
| 1546 |
+
if stats["income_proxies"]:
|
| 1547 |
+
stats["avg_income_proxy"] = round(
|
| 1548 |
+
sum(stats["income_proxies"]) / len(stats["income_proxies"]), 3
|
| 1549 |
+
)
|
| 1550 |
+
del stats["income_proxies"]
|
| 1551 |
+
|
| 1552 |
+
# Create GeoJSON for areas (polygons would need actual boundary data, using circles)
|
| 1553 |
+
area_geojson = {
|
| 1554 |
+
"type": "FeatureCollection",
|
| 1555 |
+
"features": [
|
| 1556 |
+
{
|
| 1557 |
+
"type": "Feature",
|
| 1558 |
+
"geometry": {
|
| 1559 |
+
"type": "Point",
|
| 1560 |
+
"coordinates": [stats["lng"], stats["lat"]],
|
| 1561 |
+
},
|
| 1562 |
+
"properties": {
|
| 1563 |
+
"planning_area": area,
|
| 1564 |
+
"name": stats["name"],
|
| 1565 |
+
**{k: v for k, v in stats.items() if k not in ["lat", "lng"]},
|
| 1566 |
+
},
|
| 1567 |
+
}
|
| 1568 |
+
for area, stats in area_stats.items()
|
| 1569 |
+
],
|
| 1570 |
+
}
|
| 1571 |
+
|
| 1572 |
+
# Create GeoJSON for individual points
|
| 1573 |
+
points_geojson = {
|
| 1574 |
+
"type": "FeatureCollection",
|
| 1575 |
+
"features": [
|
| 1576 |
+
{
|
| 1577 |
+
"type": "Feature",
|
| 1578 |
+
"geometry": {
|
| 1579 |
+
"type": "Point",
|
| 1580 |
+
"coordinates": [p["lng"], p["lat"]],
|
| 1581 |
+
},
|
| 1582 |
+
"properties": {
|
| 1583 |
+
"id": p["id"],
|
| 1584 |
+
"type": p["type"],
|
| 1585 |
+
"planning_area": p["planning_area"],
|
| 1586 |
+
"housing_type": p["housing_type"],
|
| 1587 |
+
"causes": p["causes"],
|
| 1588 |
+
"is_donor": p["is_donor"],
|
| 1589 |
+
},
|
| 1590 |
+
}
|
| 1591 |
+
for p in individual_points
|
| 1592 |
+
],
|
| 1593 |
+
}
|
| 1594 |
+
|
| 1595 |
+
# Summary statistics
|
| 1596 |
+
all_causes = {}
|
| 1597 |
+
all_housing = {}
|
| 1598 |
+
for stats in area_stats.values():
|
| 1599 |
+
for cause, count in stats["causes"].items():
|
| 1600 |
+
all_causes[cause] = all_causes.get(cause, 0) + count
|
| 1601 |
+
for housing, count in stats["housing_breakdown"].items():
|
| 1602 |
+
all_housing[housing] = all_housing.get(housing, 0) + count
|
| 1603 |
+
|
| 1604 |
+
return {
|
| 1605 |
+
"total_donors": sum(s["donor_count"] for s in area_stats.values()),
|
| 1606 |
+
"total_clients": sum(s["client_count"] for s in area_stats.values()),
|
| 1607 |
+
"areas_with_data": len(area_stats),
|
| 1608 |
+
"summary": {
|
| 1609 |
+
"top_causes": sorted(
|
| 1610 |
+
all_causes.items(), key=lambda x: x[1], reverse=True
|
| 1611 |
+
)[:10],
|
| 1612 |
+
"housing_distribution": all_housing,
|
| 1613 |
+
},
|
| 1614 |
+
"area_aggregates": area_geojson,
|
| 1615 |
+
"individual_points": points_geojson,
|
| 1616 |
+
"planning_areas": PLANNING_AREAS,
|
| 1617 |
+
}
|
| 1618 |
+
|
| 1619 |
+
except Exception as e:
|
| 1620 |
+
import traceback
|
| 1621 |
+
|
| 1622 |
+
traceback.print_exc()
|
| 1623 |
+
return await _generate_mock_map_demographics(causes)
|
| 1624 |
+
|
| 1625 |
+
|
| 1626 |
+
async def _generate_mock_map_demographics(causes: Optional[str] = None):
|
| 1627 |
+
"""Generate mock demographics data for map visualization."""
|
| 1628 |
+
from recommender.gis_recommender import (
|
| 1629 |
+
PLANNING_AREAS,
|
| 1630 |
+
HOUSING_INCOME_PROXY,
|
| 1631 |
+
HousingType,
|
| 1632 |
+
)
|
| 1633 |
+
import random
|
| 1634 |
+
|
| 1635 |
+
cause_list = (
|
| 1636 |
+
causes.split(",")
|
| 1637 |
+
if causes
|
| 1638 |
+
else ["education", "animals", "poverty", "environment", "health"]
|
| 1639 |
+
)
|
| 1640 |
+
|
| 1641 |
+
area_stats = {}
|
| 1642 |
+
individual_points = []
|
| 1643 |
+
|
| 1644 |
+
for area_id, area_info in PLANNING_AREAS.items():
|
| 1645 |
+
count = random.randint(3, 25)
|
| 1646 |
+
donors = random.randint(1, count)
|
| 1647 |
+
|
| 1648 |
+
area_stats[area_id] = {
|
| 1649 |
+
"name": area_info["name"],
|
| 1650 |
+
"lat": area_info["lat"],
|
| 1651 |
+
"lng": area_info["lng"],
|
| 1652 |
+
"total_count": count,
|
| 1653 |
+
"donor_count": donors,
|
| 1654 |
+
"client_count": count - donors,
|
| 1655 |
+
"causes": {
|
| 1656 |
+
cause: random.randint(1, count)
|
| 1657 |
+
for cause in random.sample(cause_list, min(3, len(cause_list)))
|
| 1658 |
+
},
|
| 1659 |
+
"housing_breakdown": {
|
| 1660 |
+
"hdb_4_room": random.randint(0, count // 2),
|
| 1661 |
+
"condo": random.randint(0, count // 3),
|
| 1662 |
+
"landed": random.randint(0, count // 4),
|
| 1663 |
+
},
|
| 1664 |
+
"avg_income_proxy": round(random.uniform(0.3, 0.8), 3),
|
| 1665 |
+
}
|
| 1666 |
+
|
| 1667 |
+
# Generate individual points
|
| 1668 |
+
for i in range(count):
|
| 1669 |
+
lat = area_info["lat"] + (random.random() - 0.5) * 0.02
|
| 1670 |
+
lng = area_info["lng"] + (random.random() - 0.5) * 0.02
|
| 1671 |
+
housing_types = [
|
| 1672 |
+
"hdb_3_room",
|
| 1673 |
+
"hdb_4_room",
|
| 1674 |
+
"hdb_5_room",
|
| 1675 |
+
"hdb_executive",
|
| 1676 |
+
"condo",
|
| 1677 |
+
"landed",
|
| 1678 |
+
]
|
| 1679 |
+
|
| 1680 |
+
individual_points.append(
|
| 1681 |
+
{
|
| 1682 |
+
"id": f"mock_{area_id}_{i}",
|
| 1683 |
+
"type": "donor" if i < donors else "client",
|
| 1684 |
+
"lat": lat,
|
| 1685 |
+
"lng": lng,
|
| 1686 |
+
"planning_area": area_id,
|
| 1687 |
+
"housing_type": random.choice(housing_types),
|
| 1688 |
+
"causes": random.sample(cause_list, min(2, len(cause_list))),
|
| 1689 |
+
"is_donor": i < donors,
|
| 1690 |
+
}
|
| 1691 |
+
)
|
| 1692 |
+
|
| 1693 |
+
# Create GeoJSON
|
| 1694 |
+
area_geojson = {
|
| 1695 |
+
"type": "FeatureCollection",
|
| 1696 |
+
"features": [
|
| 1697 |
+
{
|
| 1698 |
+
"type": "Feature",
|
| 1699 |
+
"geometry": {
|
| 1700 |
+
"type": "Point",
|
| 1701 |
+
"coordinates": [stats["lng"], stats["lat"]],
|
| 1702 |
+
},
|
| 1703 |
+
"properties": {
|
| 1704 |
+
"planning_area": area,
|
| 1705 |
+
"name": stats["name"],
|
| 1706 |
+
**{k: v for k, v in stats.items() if k not in ["lat", "lng"]},
|
| 1707 |
+
},
|
| 1708 |
+
}
|
| 1709 |
+
for area, stats in area_stats.items()
|
| 1710 |
+
],
|
| 1711 |
+
}
|
| 1712 |
+
|
| 1713 |
+
points_geojson = {
|
| 1714 |
+
"type": "FeatureCollection",
|
| 1715 |
+
"features": [
|
| 1716 |
+
{
|
| 1717 |
+
"type": "Feature",
|
| 1718 |
+
"geometry": {"type": "Point", "coordinates": [p["lng"], p["lat"]]},
|
| 1719 |
+
"properties": {k: v for k, v in p.items() if k not in ["lat", "lng"]},
|
| 1720 |
+
}
|
| 1721 |
+
for p in individual_points
|
| 1722 |
+
],
|
| 1723 |
+
}
|
| 1724 |
+
|
| 1725 |
+
return {
|
| 1726 |
+
"total_donors": sum(s["donor_count"] for s in area_stats.values()),
|
| 1727 |
+
"total_clients": sum(s["client_count"] for s in area_stats.values()),
|
| 1728 |
+
"areas_with_data": len(area_stats),
|
| 1729 |
+
"summary": {
|
| 1730 |
+
"top_causes": [(c, random.randint(10, 50)) for c in cause_list[:5]],
|
| 1731 |
+
"housing_distribution": {
|
| 1732 |
+
"hdb_4_room": 120,
|
| 1733 |
+
"condo": 45,
|
| 1734 |
+
"landed": 20,
|
| 1735 |
+
"hdb_5_room": 30,
|
| 1736 |
+
},
|
| 1737 |
+
},
|
| 1738 |
+
"area_aggregates": area_geojson,
|
| 1739 |
+
"individual_points": points_geojson,
|
| 1740 |
+
"planning_areas": PLANNING_AREAS,
|
| 1741 |
+
}
|
| 1742 |
+
|
| 1743 |
+
|
| 1744 |
+
@app.get("/debug/search-donors")
|
| 1745 |
+
async def debug_search_donors(cause: str = "education", limit: int = 10):
|
| 1746 |
+
"""
|
| 1747 |
+
Debug endpoint to directly search for donors in the database.
|
| 1748 |
+
|
| 1749 |
+
This bypasses the GIS recommender to see raw database results.
|
| 1750 |
+
"""
|
| 1751 |
+
if not encoder or not vector_store:
|
| 1752 |
+
return {"error": "Encoder or database not available"}
|
| 1753 |
+
|
| 1754 |
+
try:
|
| 1755 |
+
# Create a simple query embedding
|
| 1756 |
+
query_text = f"Donor interested in {cause} causes, looking to support {cause} initiatives"
|
| 1757 |
+
query_embedding = await encoder.encode(query_text)
|
| 1758 |
+
|
| 1759 |
+
# Search for donors
|
| 1760 |
+
donor_results = await vector_store.find_similar(
|
| 1761 |
+
query_embedding=query_embedding,
|
| 1762 |
+
form_type="donor",
|
| 1763 |
+
limit=limit,
|
| 1764 |
+
)
|
| 1765 |
+
|
| 1766 |
+
# Also search for clients
|
| 1767 |
+
client_results = await vector_store.find_similar(
|
| 1768 |
+
query_embedding=query_embedding,
|
| 1769 |
+
form_type="client",
|
| 1770 |
+
limit=limit,
|
| 1771 |
+
)
|
| 1772 |
+
|
| 1773 |
+
return {
|
| 1774 |
+
"query_cause": cause,
|
| 1775 |
+
"donor_results": [
|
| 1776 |
+
{
|
| 1777 |
+
"id": r.id,
|
| 1778 |
+
"form_type": r.form_type,
|
| 1779 |
+
"score": round(r.score, 4),
|
| 1780 |
+
"distance": round(r.distance, 4),
|
| 1781 |
+
"causes": r.form_data.get("causes", []),
|
| 1782 |
+
"country": r.form_data.get("country"),
|
| 1783 |
+
}
|
| 1784 |
+
for r in donor_results
|
| 1785 |
+
],
|
| 1786 |
+
"client_results": [
|
| 1787 |
+
{
|
| 1788 |
+
"id": r.id,
|
| 1789 |
+
"form_type": r.form_type,
|
| 1790 |
+
"score": round(r.score, 4),
|
| 1791 |
+
"distance": round(r.distance, 4),
|
| 1792 |
+
"causes": r.form_data.get("causes", []),
|
| 1793 |
+
"planning_area": r.form_data.get("planning_area"),
|
| 1794 |
+
}
|
| 1795 |
+
for r in client_results
|
| 1796 |
+
],
|
| 1797 |
+
"total_donors": len(donor_results),
|
| 1798 |
+
"total_clients": len(client_results),
|
| 1799 |
+
}
|
| 1800 |
+
except Exception as e:
|
| 1801 |
+
import traceback
|
| 1802 |
+
|
| 1803 |
+
return {"error": str(e), "traceback": traceback.format_exc()}
|
| 1804 |
+
|
| 1805 |
+
|
| 1806 |
+
# ============================================================================
|
| 1807 |
+
# Main
|
| 1808 |
+
# ============================================================================
|
| 1809 |
+
|
| 1810 |
+
if __name__ == "__main__":
|
| 1811 |
+
import uvicorn
|
| 1812 |
+
|
| 1813 |
+
# Windows-specific fix: must be set before uvicorn starts its event loop
|
| 1814 |
+
if sys.platform == "win32":
|
| 1815 |
+
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
| 1816 |
+
|
| 1817 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
encoders/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Text encoders for embedding generation."""
|
| 2 |
+
from .base import BaseEncoder
|
| 3 |
+
from .sealion import SeaLionEncoder
|
| 4 |
+
|
| 5 |
+
__all__ = ["BaseEncoder", "SeaLionEncoder"]
|
encoders/base.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base encoder abstract class."""
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from typing import List
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BaseEncoder(ABC):
|
| 9 |
+
"""Base class for text encoders.
|
| 10 |
+
|
| 11 |
+
Provides a common interface for encoding text into vector embeddings.
|
| 12 |
+
Implementations can use different models (SeaLion, OpenAI, etc.).
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
@property
|
| 16 |
+
@abstractmethod
|
| 17 |
+
def embedding_dimension(self) -> int:
|
| 18 |
+
"""Return the native embedding dimension of this encoder."""
|
| 19 |
+
pass
|
| 20 |
+
|
| 21 |
+
@abstractmethod
|
| 22 |
+
async def encode(self, text: str) -> np.ndarray:
|
| 23 |
+
"""Encode a single text into a vector.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
text: The text to encode.
|
| 27 |
+
|
| 28 |
+
Returns:
|
| 29 |
+
A numpy array of shape (embedding_dimension,).
|
| 30 |
+
"""
|
| 31 |
+
pass
|
| 32 |
+
|
| 33 |
+
@abstractmethod
|
| 34 |
+
async def encode_batch(self, texts: List[str]) -> np.ndarray:
|
| 35 |
+
"""Encode multiple texts into vectors (batch processing).
|
| 36 |
+
|
| 37 |
+
Args:
|
| 38 |
+
texts: List of texts to encode.
|
| 39 |
+
|
| 40 |
+
Returns:
|
| 41 |
+
A numpy array of shape (len(texts), embedding_dimension).
|
| 42 |
+
"""
|
| 43 |
+
pass
|
encoders/sealion.py
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SeaLion encoder for ASEAN multilingual form analysis.
|
| 2 |
+
|
| 3 |
+
Uses SeaLion chat API to analyze donor/volunteer forms and extract
|
| 4 |
+
structured features for embedding generation. SeaLion is chosen for its
|
| 5 |
+
knowledge of ASEAN nations and multilingual capabilities.
|
| 6 |
+
|
| 7 |
+
API Details:
|
| 8 |
+
- Base URL: Set via SEALION_ENDPOINT environment variable
|
| 9 |
+
- Endpoint: POST /chat
|
| 10 |
+
- Request: {"prompt": "...", "system": "..."}
|
| 11 |
+
- Response: OpenAI-compatible format with choices[0].message.content
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import httpx
|
| 16 |
+
import json
|
| 17 |
+
import hashlib
|
| 18 |
+
import numpy as np
|
| 19 |
+
from typing import List, Optional, Dict, Any
|
| 20 |
+
from .base import BaseEncoder
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Feature categories for encoding (used for vector generation)
|
| 24 |
+
CAUSE_CATEGORIES = [
|
| 25 |
+
"education", "health", "environment", "poverty", "children",
|
| 26 |
+
"elderly", "disability", "animals", "arts", "sports",
|
| 27 |
+
"disaster_relief", "human_rights", "technology", "agriculture", "housing"
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
ASEAN_COUNTRIES = ["SG", "MY", "TH", "VN", "ID", "PH", "MM", "KH", "LA", "BN"]
|
| 31 |
+
|
| 32 |
+
LANGUAGES = ["en", "ms", "th", "vi", "id", "tl", "my", "km", "lo", "zh"]
|
| 33 |
+
|
| 34 |
+
AVAILABILITY_TYPES = ["weekends", "evenings", "flexible", "full_time", "event_based"]
|
| 35 |
+
|
| 36 |
+
DONOR_TYPES = ["individual", "corporate", "foundation"]
|
| 37 |
+
|
| 38 |
+
VOLUNTEER_TYPES = ["regular", "event_based", "skilled"]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class SeaLionEncoder(BaseEncoder):
|
| 42 |
+
"""SeaLion encoder using chat API for form analysis.
|
| 43 |
+
|
| 44 |
+
Uses SeaLion's ASEAN knowledge and multilingual capabilities to:
|
| 45 |
+
1. Analyze form content semantically
|
| 46 |
+
2. Extract structured features
|
| 47 |
+
3. Generate embeddings suitable for similarity matching
|
| 48 |
+
|
| 49 |
+
The encoder combines:
|
| 50 |
+
- Feature extraction via SeaLion chat API
|
| 51 |
+
- Deterministic feature hashing for categorical data
|
| 52 |
+
- Semantic scoring from LLM analysis
|
| 53 |
+
"""
|
| 54 |
+
|
| 55 |
+
# Fixed embedding dimension (matches Supabase EMBED_DIMENSION)
|
| 56 |
+
_feature_dimension: int = 1024
|
| 57 |
+
|
| 58 |
+
def __init__(
|
| 59 |
+
self,
|
| 60 |
+
endpoint_url: Optional[str] = None,
|
| 61 |
+
timeout: float = 60.0,
|
| 62 |
+
max_retries: int = 3
|
| 63 |
+
):
|
| 64 |
+
"""Initialize SeaLion encoder.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
endpoint_url: The SeaLion API base URL. If not provided,
|
| 68 |
+
reads from SEALION_ENDPOINT environment variable.
|
| 69 |
+
timeout: Request timeout in seconds.
|
| 70 |
+
max_retries: Maximum number of retry attempts on failure.
|
| 71 |
+
"""
|
| 72 |
+
url = endpoint_url or os.getenv("SEALION_ENDPOINT")
|
| 73 |
+
if not url:
|
| 74 |
+
raise ValueError("SEALION_ENDPOINT environment variable is required")
|
| 75 |
+
self.endpoint_url = url.rstrip("/")
|
| 76 |
+
self.timeout = timeout
|
| 77 |
+
self.max_retries = max_retries
|
| 78 |
+
|
| 79 |
+
@property
|
| 80 |
+
def embedding_dimension(self) -> int:
|
| 81 |
+
"""Return the embedding dimension (fixed at 1024)."""
|
| 82 |
+
return self._feature_dimension
|
| 83 |
+
|
| 84 |
+
def _build_system_prompt(self) -> str:
|
| 85 |
+
"""Build system prompt for SeaLion analysis."""
|
| 86 |
+
return """You are an ASEAN donor/volunteer profile analyzer. Your task is to analyze form data and extract structured features for matching.
|
| 87 |
+
|
| 88 |
+
Analyze the provided form and respond with a JSON object containing these fields:
|
| 89 |
+
|
| 90 |
+
1. "causes": List of relevant cause categories from: education, health, environment, poverty, children, elderly, disability, animals, arts, sports, disaster_relief, human_rights, technology, agriculture, housing
|
| 91 |
+
|
| 92 |
+
2. "cause_scores": Object with scores (0.0-1.0) for each relevant cause based on text sentiment and context
|
| 93 |
+
|
| 94 |
+
3. "engagement_level": Score from 0.0 to 1.0 indicating commitment level (based on frequency, bio, motivation)
|
| 95 |
+
|
| 96 |
+
4. "experience_level": Score from 0.0 to 1.0 indicating prior experience
|
| 97 |
+
|
| 98 |
+
5. "financial_capacity": Score from 0.0 to 1.0 for donors (based on amount range, donor type)
|
| 99 |
+
|
| 100 |
+
6. "skills_diversity": Score from 0.0 to 1.0 for volunteers (based on skills listed)
|
| 101 |
+
|
| 102 |
+
7. "language_diversity": Score from 0.0 to 1.0 based on languages spoken
|
| 103 |
+
|
| 104 |
+
8. "motivation_themes": List of key themes extracted from bio/motivation/goals text
|
| 105 |
+
|
| 106 |
+
9. "regional_focus": Score from 0.0 to 1.0 indicating focus on ASEAN vs global causes
|
| 107 |
+
|
| 108 |
+
Respond ONLY with valid JSON, no explanation."""
|
| 109 |
+
|
| 110 |
+
async def _call_sealion(self, prompt: str) -> str:
|
| 111 |
+
"""Call SeaLion chat API.
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
prompt: The user prompt to send.
|
| 115 |
+
|
| 116 |
+
Returns:
|
| 117 |
+
The response text from SeaLion.
|
| 118 |
+
|
| 119 |
+
Raises:
|
| 120 |
+
httpx.HTTPStatusError: If request fails after retries.
|
| 121 |
+
"""
|
| 122 |
+
last_error = None
|
| 123 |
+
|
| 124 |
+
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
| 125 |
+
for attempt in range(self.max_retries):
|
| 126 |
+
try:
|
| 127 |
+
response = await client.post(
|
| 128 |
+
f"{self.endpoint_url}/chat",
|
| 129 |
+
headers={"Content-Type": "application/json"},
|
| 130 |
+
json={
|
| 131 |
+
"prompt": prompt,
|
| 132 |
+
"system": self._build_system_prompt()
|
| 133 |
+
}
|
| 134 |
+
)
|
| 135 |
+
response.raise_for_status()
|
| 136 |
+
data = response.json()
|
| 137 |
+
# Handle OpenAI-compatible format (choices array)
|
| 138 |
+
if 'choices' in data and len(data['choices']) > 0:
|
| 139 |
+
choice = data['choices'][0]
|
| 140 |
+
if 'message' in choice:
|
| 141 |
+
return choice['message'].get('content', '')
|
| 142 |
+
if 'text' in choice:
|
| 143 |
+
return choice['text']
|
| 144 |
+
# Fallback to simple format
|
| 145 |
+
return data.get("response", "")
|
| 146 |
+
except httpx.HTTPStatusError as e:
|
| 147 |
+
last_error = e
|
| 148 |
+
if e.response.status_code >= 500:
|
| 149 |
+
continue
|
| 150 |
+
raise
|
| 151 |
+
except httpx.RequestError as e:
|
| 152 |
+
last_error = e
|
| 153 |
+
continue
|
| 154 |
+
|
| 155 |
+
if last_error:
|
| 156 |
+
raise last_error
|
| 157 |
+
raise RuntimeError("SeaLion API call failed")
|
| 158 |
+
|
| 159 |
+
def _parse_sealion_response(self, response: str) -> Dict[str, Any]:
|
| 160 |
+
"""Parse SeaLion JSON response.
|
| 161 |
+
|
| 162 |
+
Args:
|
| 163 |
+
response: Raw response text from SeaLion.
|
| 164 |
+
|
| 165 |
+
Returns:
|
| 166 |
+
Parsed JSON dictionary, or empty dict if parsing fails.
|
| 167 |
+
"""
|
| 168 |
+
try:
|
| 169 |
+
# Try to extract JSON from response (may have extra text)
|
| 170 |
+
start = response.find("{")
|
| 171 |
+
end = response.rfind("}") + 1
|
| 172 |
+
if start >= 0 and end > start:
|
| 173 |
+
json_str = response[start:end]
|
| 174 |
+
return json.loads(json_str)
|
| 175 |
+
except json.JSONDecodeError:
|
| 176 |
+
pass
|
| 177 |
+
return {}
|
| 178 |
+
|
| 179 |
+
def _hash_to_vector(self, text: str, dimension: int, offset: int = 0) -> np.ndarray:
|
| 180 |
+
"""Convert text to a deterministic vector using hashing.
|
| 181 |
+
|
| 182 |
+
Args:
|
| 183 |
+
text: Text to hash.
|
| 184 |
+
dimension: Output vector dimension.
|
| 185 |
+
offset: Offset into the full embedding space.
|
| 186 |
+
|
| 187 |
+
Returns:
|
| 188 |
+
A sparse vector contribution.
|
| 189 |
+
"""
|
| 190 |
+
vector = np.zeros(self._feature_dimension, dtype=np.float32)
|
| 191 |
+
if not text:
|
| 192 |
+
return vector
|
| 193 |
+
|
| 194 |
+
# Use SHA256 for deterministic hashing
|
| 195 |
+
hash_bytes = hashlib.sha256(text.lower().encode()).digest()
|
| 196 |
+
|
| 197 |
+
# Convert to indices and values
|
| 198 |
+
for i in range(0, min(len(hash_bytes), dimension), 2):
|
| 199 |
+
idx = (hash_bytes[i] + offset) % self._feature_dimension
|
| 200 |
+
val = (hash_bytes[i + 1] / 255.0) * 2 - 1 # Normalize to [-1, 1]
|
| 201 |
+
vector[idx] += val
|
| 202 |
+
|
| 203 |
+
return vector
|
| 204 |
+
|
| 205 |
+
def _encode_categorical(
    self,
    value: str,
    categories: List[str],
    start_idx: int
) -> np.ndarray:
    """One-hot encode a categorical value via fuzzy substring matching.

    A category matches when either string contains the other
    (case-insensitive). Only the first matching category is set.

    Args:
        value: The value to encode; an empty/None value encodes nothing.
        categories: List of possible categories.
        start_idx: Starting index in the embedding vector.

    Returns:
        Embedding contribution from this categorical (float32, shape
        (self._feature_dimension,)).
    """
    vector = np.zeros(self._feature_dimension, dtype=np.float32)
    # Bug fix: an empty value previously matched the FIRST category,
    # because "" is a substring of every string. Encode nothing instead.
    if not value:
        return vector

    value_lower = value.lower()
    for i, cat in enumerate(categories):
        cat_lower = cat.lower()  # hoisted: was computed twice per iteration
        if cat_lower in value_lower or value_lower in cat_lower:
            idx = (start_idx + i) % self._feature_dimension
            vector[idx] = 1.0
            break

    return vector
|
| 231 |
+
|
| 232 |
+
def _encode_multi_categorical(
    self,
    values: List[str],
    categories: List[str],
    start_idx: int
) -> np.ndarray:
    """Multi-hot encode a list of categorical values via fuzzy matching.

    A category is set when any value and the category contain each other
    as substrings (case-insensitive).

    Args:
        values: List of values to encode; empty strings are ignored.
        categories: List of possible categories.
        start_idx: Starting index in the embedding vector.

    Returns:
        Embedding contribution from these categoricals (float32, shape
        (self._feature_dimension,)).
    """
    vector = np.zeros(self._feature_dimension, dtype=np.float32)
    # Bug fix: an empty string in `values` previously matched EVERY
    # category ("" is a substring of every string), turning the encoding
    # into an all-ones block. Drop empty entries up front.
    values_lower = [v.lower() for v in values if v] if values else []

    for i, cat in enumerate(categories):
        cat_lower = cat.lower()
        for val in values_lower:
            if cat_lower in val or val in cat_lower:
                idx = (start_idx + i) % self._feature_dimension
                vector[idx] = 1.0
                break

    return vector
|
| 260 |
+
|
| 261 |
+
def _build_embedding_from_features(
    self,
    form_text: str,
    features: Dict[str, Any]
) -> np.ndarray:
    """Build the final embedding from SeaLion-extracted features.

    Combines three signal types into one fixed index layout:
    - Deterministic text hashing (semantic coverage)
    - One-hot/multi-hot categorical encoding
    - Continuous scores from SeaLion analysis

    The index layout is hard-coded (see section comments below) and the
    vector is L2-normalized at the end so cosine similarity reduces to a
    dot product.

    Args:
        form_text: Original form text for hashing.
        features: Extracted features from SeaLion (missing keys fall back
            to neutral defaults).

    Returns:
        Final float32 embedding vector of shape (self._feature_dimension,),
        documented elsewhere as (1024,).
    """
    embedding = np.zeros(self._feature_dimension, dtype=np.float32)

    # Section 1 (indices 0-255): Text hash for semantic similarity
    embedding += self._hash_to_vector(form_text, 256, offset=0)

    # Section 2 (indices 256-511): Cause categories (multi-hot)
    causes = features.get("causes", [])
    embedding += self._encode_multi_categorical(causes, CAUSE_CATEGORIES, 256)

    # Section 3 (indices 512-527): Cause scores
    # NOTE(review): this assumes len(CAUSE_CATEGORIES) <= 16 and that
    # scores are numeric in [0, 1] — confirm against CAUSE_CATEGORIES.
    cause_scores = features.get("cause_scores", {})
    for i, cause in enumerate(CAUSE_CATEGORIES):
        idx = 512 + i
        if cause in cause_scores:
            embedding[idx] = float(cause_scores[cause])

    # Section 4 (indices 528-537): Country encoding
    # Extract country by substring match against the raw form text.
    for i, country in enumerate(ASEAN_COUNTRIES):
        if country.lower() in form_text.lower():
            embedding[528 + i] = 1.0

    # Section 5 (indices 538-547): Language encoding (substring match)
    for i, lang in enumerate(LANGUAGES):
        if lang.lower() in form_text.lower():
            embedding[538 + i] = 1.0

    # Section 6 (indices 548-557): Continuous scores, 0.5 = neutral default
    embedding[548] = features.get("engagement_level", 0.5)
    embedding[549] = features.get("experience_level", 0.5)
    embedding[550] = features.get("financial_capacity", 0.5)
    embedding[551] = features.get("skills_diversity", 0.5)
    embedding[552] = features.get("language_diversity", 0.5)
    embedding[553] = features.get("regional_focus", 0.5)

    # Section 7 (indices 558-600): Donor/volunteer type encoding
    # NOTE(review): the 558/563/568 spacing assumes each *_TYPES list has
    # at most 5 entries — confirm, otherwise sub-sections overlap.
    embedding += self._encode_categorical(
        form_text, DONOR_TYPES, 558
    )
    embedding += self._encode_categorical(
        form_text, VOLUNTEER_TYPES, 563
    )
    embedding += self._encode_categorical(
        form_text, AVAILABILITY_TYPES, 568
    )

    # Section 8 (indices 600-1023): Motivation themes hash
    # offset=600 shifts hashed slots into this section; a SHA-256 digest
    # is 32 bytes, so only a handful of slots are actually populated.
    themes = features.get("motivation_themes", [])
    if themes:
        themes_text = " ".join(themes)
        embedding += self._hash_to_vector(themes_text, 424, offset=600)

    # L2-normalize so downstream cosine similarity is a plain dot product.
    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm

    return embedding
|
| 338 |
+
|
| 339 |
+
async def encode(self, text: str) -> np.ndarray:
    """Encode form text using SeaLion analysis.

    Pipeline: send the form text to SeaLion for semantic analysis, parse
    the extracted features out of the raw response, then combine those
    features with deterministic text hashing into one embedding.

    Args:
        text: The form text to encode.

    Returns:
        A numpy array of shape (1024,).
    """
    analysis = await self._call_sealion(
        f"Analyze this donor/volunteer form:\n\n{text}"
    )
    return self._build_embedding_from_features(
        text, self._parse_sealion_response(analysis)
    )
|
| 361 |
+
|
| 362 |
+
async def encode_batch(self, texts: List[str]) -> np.ndarray:
    """Encode multiple form texts, one API round-trip per text.

    The SeaLion API has no batch endpoint, so the texts are encoded
    sequentially via :meth:`encode`.

    Args:
        texts: List of form texts to encode.

    Returns:
        A numpy array of shape (len(texts), 1024); an empty input yields
        an empty (0, dim) float32 array.
    """
    if not texts:
        return np.zeros((0, self._feature_dimension), dtype=np.float32)

    vectors = [await self.encode(form) for form in texts]
    return np.vstack(vectors)
|
graph/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LangGraph chat graph components."""
|
| 2 |
+
from .builder import build_graph_with_memory
|
| 3 |
+
from .state import State
|
| 4 |
+
from .router import router
|
| 5 |
+
|
| 6 |
+
__all__ = ["build_graph_with_memory", "State", "router"]
|
graph/builder.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langgraph.graph import StateGraph, START, END
|
| 3 |
+
from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver
|
| 4 |
+
from langgraph.store.postgres.aio import AsyncPostgresStore
|
| 5 |
+
from psycopg_pool import AsyncConnectionPool
|
| 6 |
+
from langchain_ollama import ChatOllama
|
| 7 |
+
|
| 8 |
+
from .state import State
|
| 9 |
+
from .router import router
|
| 10 |
+
from agents.therapist import TherapistAgent
|
| 11 |
+
from agents.logical import LogicalAgent
|
| 12 |
+
from agents.classifier import create_classifier
|
| 13 |
+
from agents.charity_search import CharitySearchAgent
|
| 14 |
+
from agents.agentic_rag import AgenticRAGAgent
|
| 15 |
+
from encoders.sealion import SeaLionEncoder
|
| 16 |
+
from recommender.vector_store import DonorVectorStore
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def create_connection_string() -> str:
    """Build a PostgreSQL connection URI from SUPABASE_* environment variables.

    The user and password are percent-encoded so that special characters
    ('@', '/', ':', '%', ...) in credentials cannot corrupt the URI
    structure — previously a password containing '@' would be parsed as
    part of the host.

    Returns:
        A libpq-style URI: ``postgres://user:pass@host:port/db?sslmode=...``
    """
    from urllib.parse import quote

    db_host = os.getenv("SUPABASE_DB_HOST", "localhost")
    db_port = os.getenv("SUPABASE_DB_PORT", "6543")
    db_name = os.getenv("SUPABASE_DB_NAME", "postgres")
    # safe="" so that even '/' and ':' inside credentials are escaped.
    db_user = quote(os.getenv("SUPABASE_DB_USER", "postgres"), safe="")
    db_password = quote(os.getenv("SUPABASE_DB_PASSWORD", ""), safe="")
    db_sslmode = os.getenv("SUPABASE_DB_SSLMODE", "require")

    return (
        f"postgres://{db_user}:{db_password}"
        f"@{db_host}:{db_port}/{db_name}"
        f"?sslmode={db_sslmode}"
    )
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def create_async_pool() -> AsyncConnectionPool:
    """Create an AsyncConnectionPool configured for Supabase's pooler.

    The pool is created closed (``open=False``): psycopg_pool deprecates
    opening a pool implicitly from the constructor, and the caller
    (``build_graph_with_memory``) awaits ``pool.open()`` explicitly.

    ``autocommit=True`` and ``prepare_threshold=None`` are passed to every
    connection; ``prepare_threshold=None`` disables server-side prepared
    statements, which transaction-mode poolers (port 6543) do not support.

    Returns:
        An unopened AsyncConnectionPool with up to 20 connections.
    """
    return AsyncConnectionPool(
        conninfo=create_connection_string(),
        max_size=20,
        open=False,
        kwargs={
            "autocommit": True,
            "prepare_threshold": None,
        }
    )
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
async def build_graph_with_memory():
    """Build the chat graph with Supabase-backed checkpointer and store.

    Side effects: opens a Postgres connection pool, creates LangGraph
    checkpointer/store tables if missing, and prints setup progress.

    Returns:
        A ``(graph, store, checkpointer)`` tuple — the compiled LangGraph
        plus the backing store and checkpointer.

    NOTE(review): the pool is opened here but never closed by this
    function; presumably the caller owns its lifetime — confirm.
    """

    # Create async connection pool (created closed; opened explicitly here)
    pool = create_async_pool()
    await pool.open()

    # Create checkpointer and store from the same pool
    checkpointer = AsyncPostgresSaver(pool)
    store = AsyncPostgresStore(pool)

    # Setup tables for store and checkpointer (idempotent DDL)
    print("\n[Setup] Setting up LangGraph store and checkpointer tables...")
    await checkpointer.setup()
    await store.setup()
    print("[OK] Store and checkpointer tables created!\n")

    # Use Ollama cloud with API key authentication
    api_key = os.getenv('OLLAMA_API_KEY')
    if api_key:
        llm = ChatOllama(
            model="gpt-oss:120b",
            base_url="https://ollama.com",
            client_kwargs={
                "headers": {"Authorization": f"Bearer {api_key}"}
            }
        )
    else:
        # Fallback to local Ollama if no API key
        llm = ChatOllama(model="gpt-oss:120b-cloud")

    # Initialize encoder and vector store for Agentic RAG; both stay None
    # when SEALION_ENDPOINT is unset or initialization fails, and the RAG
    # agent is still constructed with the None values.
    encoder = None
    vector_store = None
    try:
        sealion_endpoint = os.getenv("SEALION_ENDPOINT")
        if sealion_endpoint:
            encoder = SeaLionEncoder(endpoint_url=sealion_endpoint)
            vector_store = DonorVectorStore(pool)
            print("[OK] Agentic RAG initialized with SeaLion encoder\n")
    except Exception as e:
        # Best-effort: RAG is optional, so log and continue degraded.
        print(f"[WARN] Agentic RAG not available: {e}\n")

    # Create Agentic RAG agent
    agentic_rag_agent = AgenticRAGAgent(llm, encoder, vector_store)

    # Build the graph: classifier fans out to one of four agent nodes.
    graph_builder = StateGraph(State)
    graph_builder.add_node("classifier", create_classifier(llm))
    graph_builder.add_node("therapist", TherapistAgent(llm))
    graph_builder.add_node("logical", LogicalAgent(llm))
    graph_builder.add_node("charity_search", CharitySearchAgent(llm))
    graph_builder.add_node("agentic_rag", agentic_rag_agent)

    graph_builder.add_edge(START, "classifier")
    # router (graph/router.py) maps state["message_type"] to these keys.
    graph_builder.add_conditional_edges(
        "classifier",
        router,
        {
            "therapist": "therapist",
            "logical": "logical",
            "charity_search": "charity_search",
            "agentic_rag": "agentic_rag"
        }
    )
    # Every agent node terminates the graph.
    graph_builder.add_edge("therapist", END)
    graph_builder.add_edge("logical", END)
    graph_builder.add_edge("charity_search", END)
    graph_builder.add_edge("agentic_rag", END)

    # Compile with store and checkpointer for persistent memory
    graph = graph_builder.compile(
        checkpointer=checkpointer,
        store=store,
    )

    return graph, store, checkpointer
|
graph/router.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .state import State
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def router(state: State):
    """Map the classifier's message_type to the next graph node name.

    Unknown or missing types fall back to the logical agent.
    """
    routes = {
        "emotional": "therapist",
        "charity_search": "charity_search",
        "donor_search": "agentic_rag",
        "volunteer_search": "agentic_rag",
    }
    return routes.get(state.get("message_type", "logical"), "logical")
|
graph/state.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing_extensions import TypedDict
|
| 2 |
+
from typing import Annotated
|
| 3 |
+
from langgraph.graph.message import add_messages
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class State(TypedDict):
    """Shared LangGraph state flowing between the classifier and agent nodes."""

    # Conversation history; the add_messages reducer merges node output
    # into the existing list instead of replacing it.
    messages: Annotated[list, add_messages]
    # Route label produced by the classifier node (values consumed by
    # graph/router.py, e.g. "emotional", "charity_search", "donor_search",
    # "volunteer_search"); None before classification has run.
    message_type: str | None
|
recommender/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Recommender system components."""
|
| 2 |
+
from .vector_store import DonorVectorStore, SimilarityResult
|
| 3 |
+
from .gis_recommender import (
|
| 4 |
+
GISRecommender,
|
| 5 |
+
ClientProfile,
|
| 6 |
+
ScoredClient,
|
| 7 |
+
HousingType,
|
| 8 |
+
PLANNING_AREAS,
|
| 9 |
+
HOUSING_INCOME_PROXY,
|
| 10 |
+
generate_mock_clients,
|
| 11 |
+
generate_seed_donor_profile,
|
| 12 |
+
EmbeddingReducer,
|
| 13 |
+
HybridSemanticSpatialEncoder,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
"DonorVectorStore",
|
| 18 |
+
"SimilarityResult",
|
| 19 |
+
"GISRecommender",
|
| 20 |
+
"ClientProfile",
|
| 21 |
+
"ScoredClient",
|
| 22 |
+
"HousingType",
|
| 23 |
+
"PLANNING_AREAS",
|
| 24 |
+
"HOUSING_INCOME_PROXY",
|
| 25 |
+
"generate_mock_clients",
|
| 26 |
+
"generate_seed_donor_profile",
|
| 27 |
+
"EmbeddingReducer",
|
| 28 |
+
"HybridSemanticSpatialEncoder",
|
| 29 |
+
]
|
recommender/gis_recommender.py
ADDED
|
@@ -0,0 +1,1202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GIS-based Donor/Client Recommender System for ASEAN targeting.
|
| 3 |
+
|
| 4 |
+
This module implements:
|
| 5 |
+
1. Lookalike Retrieval: Find top-K nearest neighbors using cosine similarity
|
| 6 |
+
2. Spatial Filtering: Geo-fence filtering by Singapore planning areas
|
| 7 |
+
3. Tiered Targeting: Ranking based on vector similarity, spatial proxy, and donation history
|
| 8 |
+
4. GeoJSON Export: Output for map-based dashboard visualization
|
| 9 |
+
5. Dimensionality Reduction: PCA for compact semantic representation
|
| 10 |
+
|
| 11 |
+
Privacy Note:
|
| 12 |
+
- PII (names, exact addresses) are stored as encrypted metadata, NOT in the vector
|
| 13 |
+
- Coordinates are stored with reduced precision (3 decimal places ~100m accuracy)
|
| 14 |
+
- Only behavioral/interest data is embedded in the vector space
|
| 15 |
+
|
| 16 |
+
Dimensionality Reduction Strategy:
|
| 17 |
+
- Store BOTH full 1024-dim embedding AND reduced representation
|
| 18 |
+
- Reduced dimensions (2D/3D) enable:
|
| 19 |
+
1. Better matching with small datasets (less noise)
|
| 20 |
+
2. Combination with geo-coordinates for hybrid semantic-spatial search
|
| 21 |
+
3. Visualization in 2D/3D space
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import json
|
| 25 |
+
import hashlib
|
| 26 |
+
from typing import List, Optional, Dict, Any, Tuple, Union
|
| 27 |
+
from dataclasses import dataclass, field, asdict
|
| 28 |
+
from enum import Enum
|
| 29 |
+
import numpy as np
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ============================================================================
|
| 33 |
+
# Dimensionality Reduction Utilities
|
| 34 |
+
# ============================================================================
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class EmbeddingReducer:
    """
    Reduces high-dimensional embeddings to lower dimensions using PCA.

    For small datasets, this helps:
    1. Remove noise from sparse dimensions
    2. Enable combination with geo-coordinates
    3. Improve similarity matching with limited data
    """

    def __init__(self, n_components: int = 8):
        """
        Initialize reducer.

        Args:
            n_components: Target dimensionality (default 8 for semantic space)
        """
        self.n_components = n_components
        self._mean = None          # per-dimension mean learned by fit()
        self._components = None    # (n_components, D) projection matrix
        self._is_fitted = False

    def fit(self, embeddings: np.ndarray) -> "EmbeddingReducer":
        """
        Fit PCA on a set of embeddings.

        Args:
            embeddings: (N, D) array of embeddings

        Returns:
            self for chaining
        """
        if embeddings.shape[0] < 2:
            # Not enough data to fit PCA: fall back to an identity-like
            # projection onto the first n_components dimensions.
            self._mean = np.zeros(embeddings.shape[1])
            self._components = np.eye(embeddings.shape[1])[: self.n_components]
            self._is_fitted = True
            return self

        # Center the data
        self._mean = np.mean(embeddings, axis=0)
        centered = embeddings - self._mean

        # Simple PCA via SVD (works for small datasets)
        try:
            U, S, Vt = np.linalg.svd(centered, full_matrices=False)
            self._components = Vt[: self.n_components]
        except np.linalg.LinAlgError:
            # SVD failed: fall back to the top-variance raw dimensions.
            variances = np.var(centered, axis=0)
            top_dims = np.argsort(variances)[-self.n_components :]
            self._components = np.eye(embeddings.shape[1])[top_dims]

        self._is_fitted = True
        return self

    def transform(self, embeddings: np.ndarray) -> np.ndarray:
        """
        Transform embeddings to reduced dimensionality.

        Auto-fits on the given data when the reducer is not yet fitted.

        Args:
            embeddings: (N, D) or (D,) array of embeddings

        Returns:
            (N, n_components) or (n_components,) reduced, unit-length
            embeddings (1-D in always yields 1-D out)
        """
        # Bug fix: record 1-D-ness BEFORE any reshape. Previously the
        # unfitted 1-D path reshaped to (1, D) first, so `single` was
        # computed as False and a (1, k) array leaked out instead of (k,).
        single = embeddings.ndim == 1
        if single:
            embeddings = embeddings.reshape(1, -1)

        if not self._is_fitted:
            self.fit(embeddings)

        centered = embeddings - self._mean
        reduced = centered @ self._components.T

        # Normalize to unit length for cosine similarity
        norms = np.linalg.norm(reduced, axis=1, keepdims=True)
        norms = np.where(norms > 0, norms, 1)
        reduced = reduced / norms

        return reduced[0] if single else reduced

    def fit_transform(self, embeddings: np.ndarray) -> np.ndarray:
        """Fit and transform in one step."""
        return self.fit(embeddings).transform(embeddings)

    @staticmethod
    def compute_sparse_projection(
        embedding: np.ndarray, n_components: int = 8
    ) -> np.ndarray:
        """
        Fast projection for sparse embeddings without fitting.

        Selects the top-k dimensions with highest absolute values.
        Good for single queries when no training data available.
        """
        # Find non-zero dimensions
        nonzero_mask = np.abs(embedding) > 1e-6
        nonzero_indices = np.where(nonzero_mask)[0]

        if len(nonzero_indices) <= n_components:
            # Few enough non-zero dims, use them directly (zero-padded)
            result = np.zeros(n_components)
            result[: len(nonzero_indices)] = embedding[nonzero_indices]
        else:
            # Take top-k by absolute value
            top_k_in_nonzero = np.argsort(np.abs(embedding[nonzero_indices]))[
                -n_components:
            ]
            top_k_indices = nonzero_indices[top_k_in_nonzero]
            result = embedding[top_k_indices]

        # Normalize to unit length
        norm = np.linalg.norm(result)
        if norm > 0:
            result = result / norm

        return result
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
class HybridSemanticSpatialEncoder:
|
| 163 |
+
"""
|
| 164 |
+
Combines semantic embeddings with geographic coordinates.
|
| 165 |
+
|
| 166 |
+
Creates a hybrid vector that captures both:
|
| 167 |
+
1. Semantic similarity (interests, causes)
|
| 168 |
+
2. Spatial proximity (location)
|
| 169 |
+
|
| 170 |
+
This enables "find people with similar interests NEAR this location"
|
| 171 |
+
without strict geo-fencing.
|
| 172 |
+
"""
|
| 173 |
+
|
| 174 |
+
def __init__(
|
| 175 |
+
self,
|
| 176 |
+
semantic_dims: int = 8,
|
| 177 |
+
spatial_weight: float = 0.3,
|
| 178 |
+
semantic_weight: float = 0.7,
|
| 179 |
+
):
|
| 180 |
+
"""
|
| 181 |
+
Initialize hybrid encoder.
|
| 182 |
+
|
| 183 |
+
Args:
|
| 184 |
+
semantic_dims: Reduced semantic dimensions
|
| 185 |
+
spatial_weight: Weight for spatial component (0-1)
|
| 186 |
+
semantic_weight: Weight for semantic component (0-1)
|
| 187 |
+
"""
|
| 188 |
+
self.semantic_dims = semantic_dims
|
| 189 |
+
self.spatial_weight = spatial_weight
|
| 190 |
+
self.semantic_weight = semantic_weight
|
| 191 |
+
self.reducer = EmbeddingReducer(n_components=semantic_dims)
|
| 192 |
+
|
| 193 |
+
# Singapore bounding box for normalization
|
| 194 |
+
self.lat_min, self.lat_max = 1.15, 1.47 # ~35km range
|
| 195 |
+
self.lng_min, self.lng_max = 103.6, 104.1 # ~55km range
|
| 196 |
+
|
| 197 |
+
def normalize_coordinates(self, lat: float, lng: float) -> Tuple[float, float]:
|
| 198 |
+
"""Normalize coordinates to [0, 1] range within Singapore."""
|
| 199 |
+
norm_lat = (lat - self.lat_min) / (self.lat_max - self.lat_min)
|
| 200 |
+
norm_lng = (lng - self.lng_min) / (self.lng_max - self.lng_min)
|
| 201 |
+
return (np.clip(norm_lat, 0, 1), np.clip(norm_lng, 0, 1))
|
| 202 |
+
|
| 203 |
+
def encode(
|
| 204 |
+
self, embedding: np.ndarray, coordinates: Tuple[float, float]
|
| 205 |
+
) -> np.ndarray:
|
| 206 |
+
"""
|
| 207 |
+
Create hybrid semantic-spatial vector.
|
| 208 |
+
|
| 209 |
+
Args:
|
| 210 |
+
embedding: Full semantic embedding (1024-dim)
|
| 211 |
+
coordinates: (lat, lng) tuple
|
| 212 |
+
|
| 213 |
+
Returns:
|
| 214 |
+
Hybrid vector of dimension (semantic_dims + 2)
|
| 215 |
+
"""
|
| 216 |
+
# Reduce semantic embedding
|
| 217 |
+
if embedding.ndim == 1 and len(embedding) > self.semantic_dims:
|
| 218 |
+
semantic = EmbeddingReducer.compute_sparse_projection(
|
| 219 |
+
embedding, self.semantic_dims
|
| 220 |
+
)
|
| 221 |
+
else:
|
| 222 |
+
semantic = embedding[: self.semantic_dims]
|
| 223 |
+
|
| 224 |
+
# Normalize spatial
|
| 225 |
+
norm_lat, norm_lng = self.normalize_coordinates(coordinates[0], coordinates[1])
|
| 226 |
+
spatial = np.array([norm_lat, norm_lng])
|
| 227 |
+
|
| 228 |
+
# Combine with weights
|
| 229 |
+
weighted_semantic = semantic * self.semantic_weight
|
| 230 |
+
weighted_spatial = spatial * self.spatial_weight
|
| 231 |
+
|
| 232 |
+
return np.concatenate([weighted_semantic, weighted_spatial])
|
| 233 |
+
|
| 234 |
+
def compute_similarity(
    self, query_hybrid: np.ndarray, candidate_hybrid: np.ndarray
) -> float:
    """
    Compute similarity between hybrid vectors.

    The semantic slice is compared with cosine similarity and the spatial
    slice with an inverse-distance score; the two are blended using the
    encoder's configured weights.
    """
    split = self.semantic_dims

    q_sem, q_geo = query_hybrid[:split], query_hybrid[split:]
    c_sem, c_geo = candidate_hybrid[:split], candidate_hybrid[split:]

    # Cosine similarity over the semantic slice; a null vector on either
    # side yields 0 rather than dividing by zero.
    denom = np.linalg.norm(q_sem) * np.linalg.norm(c_sem)
    if denom > 0:
        sem_score = np.dot(q_sem, c_sem) / denom
    else:
        sem_score = 0.0

    # Inverse euclidean distance over the spatial slice; the x10 factor
    # sharpens the falloff within the normalized unit square.
    geo_score = 1.0 / (1.0 + np.linalg.norm(q_geo - c_geo) * 10)

    return self.semantic_weight * sem_score + self.spatial_weight * geo_score
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# ============================================================================
|
| 270 |
+
# Singapore Planning Areas & Housing Data
|
| 271 |
+
# ============================================================================
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
class HousingType(str, Enum):
    """Singapore housing types with income proxy scores.

    Inherits from ``str`` so members compare/serialize as their string
    values. Scores themselves live in ``HOUSING_INCOME_PROXY``.
    """

    # Public housing (HDB), named by flat size.
    HDB_1_2_ROOM = "hdb_1_2_room"
    HDB_3_ROOM = "hdb_3_room"
    HDB_4_ROOM = "hdb_4_room"
    HDB_5_ROOM = "hdb_5_room"
    HDB_EXECUTIVE = "hdb_executive"
    # Private housing.
    CONDO = "condo"
    LANDED = "landed"
    GCB = "gcb"  # Good Class Bungalow
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# Housing type to income proxy score (0-1)
# Larger/pricier dwelling types map to higher scores, used as a coarse
# affluence signal in spatial scoring.
# NOTE(review): values appear hand-tuned heuristics - confirm against data.
HOUSING_INCOME_PROXY = {
    HousingType.HDB_1_2_ROOM: 0.1,
    HousingType.HDB_3_ROOM: 0.25,
    HousingType.HDB_4_ROOM: 0.4,
    HousingType.HDB_5_ROOM: 0.55,
    HousingType.HDB_EXECUTIVE: 0.65,
    HousingType.CONDO: 0.75,
    HousingType.LANDED: 0.85,
    HousingType.GCB: 1.0,
}
|
| 298 |
+
|
| 299 |
+
# Singapore Planning Areas with approximate centroids
# Keys are snake_case identifiers; values hold a display name and the
# area's approximate (lat, lng) centroid.
# NOTE(review): centroids look approximate, not official URA geometry -
# verify before any precision-sensitive use.
PLANNING_AREAS = {
    "ang_mo_kio": {"name": "Ang Mo Kio", "lat": 1.3691, "lng": 103.8454},
    "bedok": {"name": "Bedok", "lat": 1.3236, "lng": 103.9273},
    "bishan": {"name": "Bishan", "lat": 1.3526, "lng": 103.8352},
    "bukit_batok": {"name": "Bukit Batok", "lat": 1.3590, "lng": 103.7637},
    "bukit_merah": {"name": "Bukit Merah", "lat": 1.2819, "lng": 103.8239},
    "bukit_panjang": {"name": "Bukit Panjang", "lat": 1.3774, "lng": 103.7719},
    "bukit_timah": {"name": "Bukit Timah", "lat": 1.3294, "lng": 103.8021},
    "central": {"name": "Central Area", "lat": 1.2789, "lng": 103.8536},
    "choa_chu_kang": {"name": "Choa Chu Kang", "lat": 1.3840, "lng": 103.7470},
    "clementi": {"name": "Clementi", "lat": 1.3162, "lng": 103.7649},
    "geylang": {"name": "Geylang", "lat": 1.3201, "lng": 103.8918},
    "hougang": {"name": "Hougang", "lat": 1.3612, "lng": 103.8863},
    "jurong_east": {"name": "Jurong East", "lat": 1.3329, "lng": 103.7436},
    "jurong_west": {"name": "Jurong West", "lat": 1.3404, "lng": 103.7090},
    "kallang": {"name": "Kallang", "lat": 1.3100, "lng": 103.8651},
    "marine_parade": {"name": "Marine Parade", "lat": 1.3020, "lng": 103.9072},
    "novena": {"name": "Novena", "lat": 1.3204, "lng": 103.8438},
    "orchard": {"name": "Orchard", "lat": 1.3048, "lng": 103.8318},
    "pasir_ris": {"name": "Pasir Ris", "lat": 1.3721, "lng": 103.9474},
    "punggol": {"name": "Punggol", "lat": 1.3984, "lng": 103.9072},
    "queenstown": {"name": "Queenstown", "lat": 1.2942, "lng": 103.7861},
    "sembawang": {"name": "Sembawang", "lat": 1.4491, "lng": 103.8185},
    "sengkang": {"name": "Sengkang", "lat": 1.3868, "lng": 103.8914},
    "serangoon": {"name": "Serangoon", "lat": 1.3554, "lng": 103.8679},
    "tampines": {"name": "Tampines", "lat": 1.3496, "lng": 103.9568},
    "toa_payoh": {"name": "Toa Payoh", "lat": 1.3343, "lng": 103.8563},
    "woodlands": {"name": "Woodlands", "lat": 1.4382, "lng": 103.7891},
    "yishun": {"name": "Yishun", "lat": 1.4304, "lng": 103.8354},
}
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
# ============================================================================
|
| 333 |
+
# Data Models
|
| 334 |
+
# ============================================================================
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
@dataclass
class ClientProfile:
    """Client/Donor profile with spatial and behavioral data.

    Privacy considerations:
    - user_id is a hashed identifier, not PII
    - coordinates are reduced precision (~100m accuracy)
    - name_encrypted would be encrypted in production

    Embedding Strategy:
    - embedding: Full 1024-dim vector for accuracy at scale
    - embedding_reduced: 8-dim compact vector for small dataset matching
    - hybrid_embedding: Semantic + spatial combined vector
    """

    # Hashed identifier (not PII).
    user_id: str

    # Spatial data (reduced precision for privacy)
    coordinates: Tuple[float, float]  # (lat, lng) - 3 decimal precision
    planning_area: str  # key into PLANNING_AREAS
    housing_type: HousingType

    # Behavioral/Interest data (embedded in vector)
    interests: List[str]
    causes: List[str]
    preferred_language: str  # ISO-ish code, e.g. "en"

    # Donation history
    is_donor: bool = False
    total_donated: float = 0.0
    last_donation_amount: float = 0.0
    last_org_donated: Optional[str] = None
    donation_count: int = 0

    # Metadata (not embedded)
    name_encrypted: Optional[str] = None  # Would be encrypted in production
    age_range: Optional[str] = None  # e.g., "25-34", "35-44"

    # Vector embeddings
    embedding: Optional[List[float]] = None  # Full 1024-dim
    embedding_reduced: Optional[List[float]] = None  # Reduced 8-dim
    hybrid_embedding: Optional[List[float]] = None  # Semantic + spatial (10-dim)

    def to_embedding_text(self) -> str:
        """Convert profile to text for embedding generation.

        Returns a newline-separated block of "key: value" lines; donor
        status is appended only for donors.
        """
        parts = [
            f"Planning area: {self.planning_area}",
            f"Housing: {self.housing_type.value}",
            f"Interests: {', '.join(self.interests)}",
            f"Causes: {', '.join(self.causes)}",
            f"Language: {self.preferred_language}",
        ]
        if self.is_donor:
            parts.append(f"Donor with {self.donation_count} donations")
        return "\n".join(parts)

    def compute_reduced_embeddings(self, semantic_dims: int = 8) -> None:
        """
        Compute reduced and hybrid embeddings from full embedding.

        Call this after setting the full embedding. Silently does nothing
        when no full embedding is present.
        """
        if self.embedding is None:
            return

        full_emb = np.array(self.embedding)

        # Compute reduced embedding using sparse projection
        reduced = EmbeddingReducer.compute_sparse_projection(full_emb, semantic_dims)
        self.embedding_reduced = reduced.tolist()

        # Compute hybrid embedding with spatial
        # NOTE: uses the encoder's default spatial/semantic weights.
        encoder = HybridSemanticSpatialEncoder(semantic_dims=semantic_dims)
        hybrid = encoder.encode(full_emb, self.coordinates)
        self.hybrid_embedding = hybrid.tolist()

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization.

        Embedding vectors are deliberately replaced by boolean presence
        flags to keep payloads small; name_encrypted is omitted entirely.
        """
        return {
            "user_id": self.user_id,
            "coordinates": list(self.coordinates),
            "planning_area": self.planning_area,
            "housing_type": self.housing_type.value,
            "interests": self.interests,
            "causes": self.causes,
            "preferred_language": self.preferred_language,
            "is_donor": self.is_donor,
            "total_donated": self.total_donated,
            "last_donation_amount": self.last_donation_amount,
            "last_org_donated": self.last_org_donated,
            "donation_count": self.donation_count,
            "age_range": self.age_range,
            "has_reduced_embedding": self.embedding_reduced is not None,
            "has_hybrid_embedding": self.hybrid_embedding is not None,
        }
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
@dataclass
class ScoredClient:
    """Client with computed targeting scores."""

    # The underlying profile being scored.
    client: ClientProfile

    # Individual scores (0-1)
    vector_similarity_score: float = 0.0  # embedding-space similarity
    spatial_proxy_score: float = 0.0  # housing-type income proxy
    proximity_score: float = 0.0  # distance-based score

    # Combined score
    final_score: float = 0.0  # weighted blend of the individual scores

    # Distance from query (for debugging)
    vector_distance: float = 0.0  # raw distance in embedding space
    geo_distance_km: float = 0.0  # great-circle distance to the seed profile
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
@dataclass
class GeoJSONFeature:
    """GeoJSON Feature for map visualization.

    Mirrors the GeoJSON "Feature" object shape (RFC 7946): a fixed
    ``type`` tag plus ``geometry`` and ``properties`` dicts.
    """

    type: str = "Feature"
    # e.g. {"type": "Point", "coordinates": [lng, lat]}
    geometry: Dict[str, Any] = field(default_factory=dict)
    # Free-form display metadata; keep PII out of this dict.
    properties: Dict[str, Any] = field(default_factory=dict)
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
# ============================================================================
|
| 463 |
+
# GIS Recommender System
|
| 464 |
+
# ============================================================================
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
class GISRecommender:
|
| 468 |
+
"""
|
| 469 |
+
GIS-enhanced recommender using vector similarity + spatial targeting.
|
| 470 |
+
|
| 471 |
+
Features:
|
| 472 |
+
1. Lookalike retrieval using SEA-LION embeddings
|
| 473 |
+
2. Geo-fence filtering by planning area
|
| 474 |
+
3. Tiered scoring combining multiple signals
|
| 475 |
+
4. GeoJSON export for visualization
|
| 476 |
+
5. Hybrid semantic-spatial matching for small datasets
|
| 477 |
+
"""
|
| 478 |
+
|
| 479 |
+
def __init__(self, vector_store=None, encoder=None):
|
| 480 |
+
"""Initialize recommender with vector store and encoder."""
|
| 481 |
+
self.vector_store = vector_store
|
| 482 |
+
self.encoder = encoder
|
| 483 |
+
|
| 484 |
+
# Hybrid encoder for small dataset matching
|
| 485 |
+
self.hybrid_encoder = HybridSemanticSpatialEncoder(
|
| 486 |
+
semantic_dims=8, spatial_weight=0.3, semantic_weight=0.7
|
| 487 |
+
)
|
| 488 |
+
|
| 489 |
+
# Scoring weights (can be tuned)
|
| 490 |
+
self.weights = {
|
| 491 |
+
"vector_similarity": 0.5,
|
| 492 |
+
"spatial_proxy": 0.3,
|
| 493 |
+
"proximity": 0.2,
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
# Threshold for using hybrid matching
|
| 497 |
+
self.small_dataset_threshold = 100
|
| 498 |
+
|
| 499 |
+
@staticmethod
|
| 500 |
+
def haversine_distance(
|
| 501 |
+
coord1: Tuple[float, float], coord2: Tuple[float, float]
|
| 502 |
+
) -> float:
|
| 503 |
+
"""Calculate distance between two coordinates in kilometers."""
|
| 504 |
+
from math import radians, sin, cos, sqrt, atan2
|
| 505 |
+
|
| 506 |
+
lat1, lon1 = radians(coord1[0]), radians(coord1[1])
|
| 507 |
+
lat2, lon2 = radians(coord2[0]), radians(coord2[1])
|
| 508 |
+
|
| 509 |
+
dlat = lat2 - lat1
|
| 510 |
+
dlon = lon2 - lon1
|
| 511 |
+
|
| 512 |
+
a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
|
| 513 |
+
c = 2 * atan2(sqrt(a), sqrt(1 - a))
|
| 514 |
+
|
| 515 |
+
# Earth's radius in km
|
| 516 |
+
return 6371 * c
|
| 517 |
+
|
| 518 |
+
@staticmethod
|
| 519 |
+
def reduce_coordinate_precision(
|
| 520 |
+
lat: float, lng: float, decimals: int = 3
|
| 521 |
+
) -> Tuple[float, float]:
|
| 522 |
+
"""Reduce coordinate precision for privacy (~100m at 3 decimals)."""
|
| 523 |
+
return (round(lat, decimals), round(lng, decimals))
|
| 524 |
+
|
| 525 |
+
def calculate_spatial_proxy_score(self, client: ClientProfile) -> float:
|
| 526 |
+
"""Calculate income proxy score based on housing type."""
|
| 527 |
+
return HOUSING_INCOME_PROXY.get(client.housing_type, 0.5)
|
| 528 |
+
|
| 529 |
+
def calculate_proximity_score(
|
| 530 |
+
self, client: ClientProfile, event_locations: List[Tuple[float, float]] = None
|
| 531 |
+
) -> float:
|
| 532 |
+
"""
|
| 533 |
+
Calculate proximity score based on distance to successful donation events.
|
| 534 |
+
|
| 535 |
+
Lower distance = higher score.
|
| 536 |
+
"""
|
| 537 |
+
if not event_locations:
|
| 538 |
+
return 0.5 # Default score if no events
|
| 539 |
+
|
| 540 |
+
# Find minimum distance to any event
|
| 541 |
+
min_distance = float("inf")
|
| 542 |
+
for event_coord in event_locations:
|
| 543 |
+
dist = self.haversine_distance(client.coordinates, event_coord)
|
| 544 |
+
min_distance = min(min_distance, dist)
|
| 545 |
+
|
| 546 |
+
# Convert distance to score (0-1)
|
| 547 |
+
# Max distance in Singapore ~40km, normalize accordingly
|
| 548 |
+
max_distance = 40.0
|
| 549 |
+
score = max(0, 1 - (min_distance / max_distance))
|
| 550 |
+
return score
|
| 551 |
+
|
| 552 |
+
def calculate_vector_similarity(self, distance: float) -> float:
|
| 553 |
+
"""Convert L2 distance to similarity score (0-1)."""
|
| 554 |
+
return 1.0 / (1.0 + distance)
|
| 555 |
+
|
| 556 |
+
def find_lookalikes_hybrid(
|
| 557 |
+
self,
|
| 558 |
+
seed_profile: ClientProfile,
|
| 559 |
+
candidates: List[ClientProfile],
|
| 560 |
+
k: int = 50,
|
| 561 |
+
planning_area_filter: Optional[str] = None,
|
| 562 |
+
housing_type_filter: Optional[List[HousingType]] = None,
|
| 563 |
+
) -> List[ScoredClient]:
|
| 564 |
+
"""
|
| 565 |
+
Find lookalikes using hybrid semantic-spatial matching.
|
| 566 |
+
|
| 567 |
+
This method is optimized for small datasets where pure vector
|
| 568 |
+
similarity may not work well due to sparse embeddings.
|
| 569 |
+
|
| 570 |
+
Args:
|
| 571 |
+
seed_profile: The "ideal donor" profile to match against
|
| 572 |
+
candidates: List of candidate client profiles
|
| 573 |
+
k: Number of neighbors to retrieve
|
| 574 |
+
planning_area_filter: Optional geo-fence filter
|
| 575 |
+
housing_type_filter: Optional housing type filter
|
| 576 |
+
|
| 577 |
+
Returns:
|
| 578 |
+
List of ScoredClient objects ranked by hybrid similarity
|
| 579 |
+
"""
|
| 580 |
+
if not seed_profile.embedding:
|
| 581 |
+
# Generate a mock embedding based on profile text
|
| 582 |
+
seed_profile.embedding = self._generate_fallback_embedding(seed_profile)
|
| 583 |
+
|
| 584 |
+
# Compute hybrid embedding for seed
|
| 585 |
+
seed_emb = np.array(seed_profile.embedding)
|
| 586 |
+
seed_hybrid = self.hybrid_encoder.encode(seed_emb, seed_profile.coordinates)
|
| 587 |
+
|
| 588 |
+
scored_clients = []
|
| 589 |
+
|
| 590 |
+
for client in candidates:
|
| 591 |
+
# Apply filters
|
| 592 |
+
if planning_area_filter and client.planning_area != planning_area_filter:
|
| 593 |
+
continue
|
| 594 |
+
|
| 595 |
+
if housing_type_filter:
|
| 596 |
+
if client.housing_type not in housing_type_filter:
|
| 597 |
+
continue
|
| 598 |
+
|
| 599 |
+
# Ensure client has embedding
|
| 600 |
+
if not client.embedding:
|
| 601 |
+
client.embedding = self._generate_fallback_embedding(client)
|
| 602 |
+
|
| 603 |
+
# Compute hybrid embedding for candidate
|
| 604 |
+
cand_emb = np.array(client.embedding)
|
| 605 |
+
cand_hybrid = self.hybrid_encoder.encode(cand_emb, client.coordinates)
|
| 606 |
+
|
| 607 |
+
# Compute hybrid similarity
|
| 608 |
+
hybrid_sim = self.hybrid_encoder.compute_similarity(
|
| 609 |
+
seed_hybrid, cand_hybrid
|
| 610 |
+
)
|
| 611 |
+
|
| 612 |
+
# Calculate other scores
|
| 613 |
+
spatial_score = self.calculate_spatial_proxy_score(client)
|
| 614 |
+
geo_dist = self.haversine_distance(
|
| 615 |
+
seed_profile.coordinates, client.coordinates
|
| 616 |
+
)
|
| 617 |
+
proximity_score = max(0, 1 - (geo_dist / 40.0))
|
| 618 |
+
|
| 619 |
+
# Weighted final score
|
| 620 |
+
final_score = (
|
| 621 |
+
0.6 * hybrid_sim # Higher weight on hybrid similarity
|
| 622 |
+
+ 0.2 * spatial_score
|
| 623 |
+
+ 0.2 * proximity_score
|
| 624 |
+
)
|
| 625 |
+
|
| 626 |
+
scored_clients.append(
|
| 627 |
+
ScoredClient(
|
| 628 |
+
client=client,
|
| 629 |
+
vector_similarity_score=hybrid_sim,
|
| 630 |
+
spatial_proxy_score=spatial_score,
|
| 631 |
+
proximity_score=proximity_score,
|
| 632 |
+
final_score=final_score,
|
| 633 |
+
vector_distance=1 - hybrid_sim,
|
| 634 |
+
geo_distance_km=geo_dist,
|
| 635 |
+
)
|
| 636 |
+
)
|
| 637 |
+
|
| 638 |
+
# Sort by final score
|
| 639 |
+
scored_clients.sort(key=lambda x: x.final_score, reverse=True)
|
| 640 |
+
return scored_clients[:k]
|
| 641 |
+
|
| 642 |
+
def _generate_fallback_embedding(self, profile: ClientProfile) -> List[float]:
|
| 643 |
+
"""
|
| 644 |
+
Generate a deterministic fallback embedding when encoder is unavailable.
|
| 645 |
+
|
| 646 |
+
Uses a hash of profile features to create a pseudo-embedding.
|
| 647 |
+
This ensures consistent matching even without the actual encoder.
|
| 648 |
+
"""
|
| 649 |
+
# Create a feature string
|
| 650 |
+
features = [
|
| 651 |
+
profile.planning_area,
|
| 652 |
+
profile.housing_type.value,
|
| 653 |
+
",".join(sorted(profile.interests)),
|
| 654 |
+
",".join(sorted(profile.causes)),
|
| 655 |
+
profile.preferred_language,
|
| 656 |
+
str(profile.is_donor),
|
| 657 |
+
]
|
| 658 |
+
feature_str = "|".join(features)
|
| 659 |
+
|
| 660 |
+
# Use hash to generate pseudo-random but deterministic values
|
| 661 |
+
hash_bytes = hashlib.sha256(feature_str.encode()).digest()
|
| 662 |
+
|
| 663 |
+
# Expand hash to 1024 dimensions using multiple rounds
|
| 664 |
+
embedding = []
|
| 665 |
+
for i in range(64): # 64 rounds of 16 values each = 1024
|
| 666 |
+
seed = int.from_bytes(hash_bytes, "big") + i
|
| 667 |
+
np.random.seed(seed % (2**32))
|
| 668 |
+
chunk = np.random.randn(16) * 0.1
|
| 669 |
+
embedding.extend(chunk.tolist())
|
| 670 |
+
|
| 671 |
+
# Normalize
|
| 672 |
+
emb_array = np.array(embedding[:1024])
|
| 673 |
+
norm = np.linalg.norm(emb_array)
|
| 674 |
+
if norm > 0:
|
| 675 |
+
emb_array = emb_array / norm
|
| 676 |
+
|
| 677 |
+
return emb_array.tolist()
|
| 678 |
+
|
| 679 |
+
def _form_data_to_client_profile(
|
| 680 |
+
self, user_id: str, form_data: Dict[str, Any], form_type: str
|
| 681 |
+
) -> ClientProfile:
|
| 682 |
+
"""
|
| 683 |
+
Convert form data from database to ClientProfile.
|
| 684 |
+
|
| 685 |
+
Handles both donor forms (from /donors/register) and client forms
|
| 686 |
+
(from /clients/register) which have different field structures.
|
| 687 |
+
|
| 688 |
+
Donor forms have: name, donor_type, country, preferred_language, causes,
|
| 689 |
+
donation_frequency, amount_range, bio, motivation
|
| 690 |
+
Client forms have: coordinates, planning_area, housing_type, interests,
|
| 691 |
+
causes, preferred_language, is_donor, etc.
|
| 692 |
+
|
| 693 |
+
For donors without GIS data, we infer reasonable defaults based on
|
| 694 |
+
available information.
|
| 695 |
+
"""
|
| 696 |
+
import random
|
| 697 |
+
|
| 698 |
+
# Check if this is a donor form (different structure)
|
| 699 |
+
is_donor_form = form_type == "donor" or "donor_type" in form_data
|
| 700 |
+
|
| 701 |
+
if is_donor_form:
|
| 702 |
+
# Convert donor form data to client profile
|
| 703 |
+
# Infer GIS data from available information
|
| 704 |
+
|
| 705 |
+
# Get country and infer planning area
|
| 706 |
+
country = form_data.get("country", "SG")
|
| 707 |
+
|
| 708 |
+
# Assign a random planning area (in production, could use IP geolocation)
|
| 709 |
+
if country == "SG":
|
| 710 |
+
planning_areas = list(PLANNING_AREAS.keys())
|
| 711 |
+
# Use hash of user_id for deterministic assignment
|
| 712 |
+
area_idx = hash(user_id) % len(planning_areas)
|
| 713 |
+
planning_area = planning_areas[area_idx]
|
| 714 |
+
area_info = PLANNING_AREAS[planning_area]
|
| 715 |
+
# Add small random offset for privacy
|
| 716 |
+
random.seed(hash(user_id))
|
| 717 |
+
lat = area_info["lat"] + random.uniform(-0.003, 0.003)
|
| 718 |
+
lng = area_info["lng"] + random.uniform(-0.003, 0.003)
|
| 719 |
+
coordinates = (round(lat, 4), round(lng, 4))
|
| 720 |
+
else:
|
| 721 |
+
# Non-SG donors - use central SG as placeholder
|
| 722 |
+
planning_area = "central"
|
| 723 |
+
coordinates = (1.2897, 103.8501)
|
| 724 |
+
|
| 725 |
+
# Infer housing type from amount_range (income proxy)
|
| 726 |
+
amount_range = form_data.get("amount_range", "")
|
| 727 |
+
if "5000" in amount_range or "10000" in amount_range:
|
| 728 |
+
housing_type = HousingType.LANDED
|
| 729 |
+
elif "2000" in amount_range or "3000" in amount_range:
|
| 730 |
+
housing_type = HousingType.CONDO
|
| 731 |
+
elif "1000" in amount_range:
|
| 732 |
+
housing_type = HousingType.HDB_EXECUTIVE
|
| 733 |
+
elif "500" in amount_range:
|
| 734 |
+
housing_type = HousingType.HDB_5_ROOM
|
| 735 |
+
elif "100" in amount_range or "200" in amount_range:
|
| 736 |
+
housing_type = HousingType.HDB_4_ROOM
|
| 737 |
+
else:
|
| 738 |
+
# Default based on donor_type
|
| 739 |
+
donor_type = form_data.get("donor_type", "individual")
|
| 740 |
+
if donor_type == "corporate":
|
| 741 |
+
housing_type = HousingType.CONDO # Proxy for corporate
|
| 742 |
+
elif donor_type == "foundation":
|
| 743 |
+
housing_type = HousingType.LANDED # High value
|
| 744 |
+
else:
|
| 745 |
+
housing_type = HousingType.HDB_4_ROOM
|
| 746 |
+
|
| 747 |
+
# Get causes and infer interests from bio/motivation
|
| 748 |
+
causes = form_data.get("causes", [])
|
| 749 |
+
|
| 750 |
+
# Extract interests from bio and motivation text
|
| 751 |
+
bio = form_data.get("bio", "")
|
| 752 |
+
motivation = form_data.get("motivation", "")
|
| 753 |
+
combined_text = f"{bio} {motivation}".lower()
|
| 754 |
+
|
| 755 |
+
interest_keywords = {
|
| 756 |
+
"technology": ["tech", "software", "digital", "innovation", "startup"],
|
| 757 |
+
"sustainability": [
|
| 758 |
+
"green",
|
| 759 |
+
"sustainable",
|
| 760 |
+
"climate",
|
| 761 |
+
"environment",
|
| 762 |
+
"eco",
|
| 763 |
+
],
|
| 764 |
+
"finance": ["finance", "banking", "investment", "money", "economic"],
|
| 765 |
+
"healthcare": ["health", "medical", "hospital", "wellness", "care"],
|
| 766 |
+
"education": ["education", "school", "learning", "teach", "university"],
|
| 767 |
+
"community": [
|
| 768 |
+
"community",
|
| 769 |
+
"local",
|
| 770 |
+
"neighborhood",
|
| 771 |
+
"social",
|
| 772 |
+
"volunteer",
|
| 773 |
+
],
|
| 774 |
+
"arts": ["art", "culture", "music", "creative", "design"],
|
| 775 |
+
}
|
| 776 |
+
|
| 777 |
+
interests = []
|
| 778 |
+
for interest, keywords in interest_keywords.items():
|
| 779 |
+
if any(kw in combined_text for kw in keywords):
|
| 780 |
+
interests.append(interest)
|
| 781 |
+
|
| 782 |
+
# Add causes as interests too (overlap is fine)
|
| 783 |
+
for cause in causes:
|
| 784 |
+
if cause not in interests:
|
| 785 |
+
interests.append(cause)
|
| 786 |
+
|
| 787 |
+
return ClientProfile(
|
| 788 |
+
user_id=user_id,
|
| 789 |
+
coordinates=coordinates,
|
| 790 |
+
planning_area=planning_area,
|
| 791 |
+
housing_type=housing_type,
|
| 792 |
+
interests=interests[:5], # Limit to 5
|
| 793 |
+
causes=causes,
|
| 794 |
+
preferred_language=form_data.get("preferred_language", "en"),
|
| 795 |
+
is_donor=True, # Came from donor registration
|
| 796 |
+
total_donated=0, # Unknown for new donors
|
| 797 |
+
donation_count=0,
|
| 798 |
+
age_range=None,
|
| 799 |
+
)
|
| 800 |
+
else:
|
| 801 |
+
# Client form - has GIS data directly
|
| 802 |
+
return ClientProfile(
|
| 803 |
+
user_id=user_id,
|
| 804 |
+
coordinates=tuple(form_data.get("coordinates", [1.3521, 103.8198])),
|
| 805 |
+
planning_area=form_data.get("planning_area", "central"),
|
| 806 |
+
housing_type=HousingType(form_data.get("housing_type", "hdb_4_room")),
|
| 807 |
+
interests=form_data.get("interests", []),
|
| 808 |
+
causes=form_data.get("causes", []),
|
| 809 |
+
preferred_language=form_data.get("preferred_language", "en"),
|
| 810 |
+
is_donor=form_data.get("is_donor", False),
|
| 811 |
+
total_donated=form_data.get("total_donated", 0),
|
| 812 |
+
donation_count=form_data.get("donation_count", 0),
|
| 813 |
+
age_range=form_data.get("age_range"),
|
| 814 |
+
)
|
| 815 |
+
|
| 816 |
+
async def find_lookalikes(
|
| 817 |
+
self,
|
| 818 |
+
seed_profile: ClientProfile,
|
| 819 |
+
k: int = 50,
|
| 820 |
+
planning_area_filter: Optional[str] = None,
|
| 821 |
+
housing_type_filter: Optional[List[HousingType]] = None,
|
| 822 |
+
use_hybrid: bool = False,
|
| 823 |
+
fallback_candidates: Optional[List[ClientProfile]] = None,
|
| 824 |
+
) -> List[ScoredClient]:
|
| 825 |
+
"""
|
| 826 |
+
Find top-K lookalikes for a seed donor profile.
|
| 827 |
+
|
| 828 |
+
Args:
|
| 829 |
+
seed_profile: The "ideal donor" profile to match against
|
| 830 |
+
k: Number of neighbors to retrieve
|
| 831 |
+
planning_area_filter: Optional geo-fence filter
|
| 832 |
+
housing_type_filter: Optional housing type filter
|
| 833 |
+
use_hybrid: Force hybrid matching (good for small datasets)
|
| 834 |
+
fallback_candidates: Candidates to use if vector store returns nothing
|
| 835 |
+
|
| 836 |
+
Returns:
|
| 837 |
+
List of ScoredClient objects ranked by similarity
|
| 838 |
+
"""
|
| 839 |
+
# Check if we should use hybrid matching
|
| 840 |
+
if use_hybrid and fallback_candidates:
|
| 841 |
+
return self.find_lookalikes_hybrid(
|
| 842 |
+
seed_profile=seed_profile,
|
| 843 |
+
candidates=fallback_candidates,
|
| 844 |
+
k=k,
|
| 845 |
+
planning_area_filter=planning_area_filter,
|
| 846 |
+
housing_type_filter=housing_type_filter,
|
| 847 |
+
)
|
| 848 |
+
|
| 849 |
+
if not self.encoder or not self.vector_store:
|
| 850 |
+
# No encoder/store - use hybrid with fallback candidates
|
| 851 |
+
if fallback_candidates:
|
| 852 |
+
return self.find_lookalikes_hybrid(
|
| 853 |
+
seed_profile=seed_profile,
|
| 854 |
+
candidates=fallback_candidates,
|
| 855 |
+
k=k,
|
| 856 |
+
planning_area_filter=planning_area_filter,
|
| 857 |
+
housing_type_filter=housing_type_filter,
|
| 858 |
+
)
|
| 859 |
+
raise ValueError(
|
| 860 |
+
"Encoder and vector store must be initialized, or provide fallback_candidates"
|
| 861 |
+
)
|
| 862 |
+
|
| 863 |
+
# Generate embedding for seed profile
|
| 864 |
+
seed_text = seed_profile.to_embedding_text()
|
| 865 |
+
seed_embedding = await self.encoder.encode(seed_text)
|
| 866 |
+
|
| 867 |
+
# Query vector store - search for BOTH donors and clients
|
| 868 |
+
# Donors registered via /donors/register have form_type="donor"
|
| 869 |
+
# Clients registered via /clients/register have form_type="client"
|
| 870 |
+
all_results = []
|
| 871 |
+
|
| 872 |
+
# Search for donors first (main source of potential clients for donees)
|
| 873 |
+
donor_results = await self.vector_store.find_similar(
|
| 874 |
+
query_embedding=seed_embedding,
|
| 875 |
+
form_type="donor",
|
| 876 |
+
limit=k * 2,
|
| 877 |
+
country_filter="SG",
|
| 878 |
+
)
|
| 879 |
+
all_results.extend(donor_results)
|
| 880 |
+
|
| 881 |
+
# Also search for clients (if any registered via client endpoint)
|
| 882 |
+
client_results = await self.vector_store.find_similar(
|
| 883 |
+
query_embedding=seed_embedding,
|
| 884 |
+
form_type="client",
|
| 885 |
+
limit=k * 2,
|
| 886 |
+
country_filter="SG",
|
| 887 |
+
)
|
| 888 |
+
all_results.extend(client_results)
|
| 889 |
+
|
| 890 |
+
# Deduplicate by ID and sort by distance
|
| 891 |
+
seen_ids = set()
|
| 892 |
+
results = []
|
| 893 |
+
for r in sorted(all_results, key=lambda x: x.distance):
|
| 894 |
+
if r.id not in seen_ids:
|
| 895 |
+
seen_ids.add(r.id)
|
| 896 |
+
results.append(r)
|
| 897 |
+
|
| 898 |
+
scored_clients = []
|
| 899 |
+
for result in results:
|
| 900 |
+
# Reconstruct client profile from form_data
|
| 901 |
+
form_data = result.form_data
|
| 902 |
+
|
| 903 |
+
# Apply planning area filter
|
| 904 |
+
if planning_area_filter:
|
| 905 |
+
if form_data.get("planning_area") != planning_area_filter:
|
| 906 |
+
continue
|
| 907 |
+
|
| 908 |
+
# Apply housing type filter
|
| 909 |
+
if housing_type_filter:
|
| 910 |
+
client_housing = form_data.get("housing_type")
|
| 911 |
+
if client_housing not in [h.value for h in housing_type_filter]:
|
| 912 |
+
continue
|
| 913 |
+
|
| 914 |
+
# Create client profile from form_data
|
| 915 |
+
# Handle both donor forms (different fields) and client forms
|
| 916 |
+
client = self._form_data_to_client_profile(
|
| 917 |
+
result.id, form_data, result.form_type
|
| 918 |
+
)
|
| 919 |
+
|
| 920 |
+
# Calculate scores
|
| 921 |
+
vector_score = self.calculate_vector_similarity(result.distance)
|
| 922 |
+
spatial_score = self.calculate_spatial_proxy_score(client)
|
| 923 |
+
proximity_score = 0.5 # Default, can be enhanced with event data
|
| 924 |
+
|
| 925 |
+
# Calculate final weighted score
|
| 926 |
+
final_score = (
|
| 927 |
+
self.weights["vector_similarity"] * vector_score
|
| 928 |
+
+ self.weights["spatial_proxy"] * spatial_score
|
| 929 |
+
+ self.weights["proximity"] * proximity_score
|
| 930 |
+
)
|
| 931 |
+
|
| 932 |
+
scored_clients.append(
|
| 933 |
+
ScoredClient(
|
| 934 |
+
client=client,
|
| 935 |
+
vector_similarity_score=vector_score,
|
| 936 |
+
spatial_proxy_score=spatial_score,
|
| 937 |
+
proximity_score=proximity_score,
|
| 938 |
+
final_score=final_score,
|
| 939 |
+
vector_distance=result.distance,
|
| 940 |
+
)
|
| 941 |
+
)
|
| 942 |
+
|
| 943 |
+
# Sort by final score and return top K
|
| 944 |
+
scored_clients.sort(key=lambda x: x.final_score, reverse=True)
|
| 945 |
+
return scored_clients[:k]
|
| 946 |
+
|
| 947 |
+
def apply_tiered_targeting(
|
| 948 |
+
self, clients: List[ScoredClient], min_score: float = 0.0, tiers: int = 3
|
| 949 |
+
) -> Dict[str, List[ScoredClient]]:
|
| 950 |
+
"""
|
| 951 |
+
Apply tiered targeting to segment clients.
|
| 952 |
+
|
| 953 |
+
Returns clients grouped into tiers:
|
| 954 |
+
- Tier 1: High priority (top third)
|
| 955 |
+
- Tier 2: Medium priority (middle third)
|
| 956 |
+
- Tier 3: Lower priority (bottom third)
|
| 957 |
+
"""
|
| 958 |
+
# Filter by minimum score
|
| 959 |
+
filtered = [c for c in clients if c.final_score >= min_score]
|
| 960 |
+
|
| 961 |
+
if not filtered:
|
| 962 |
+
return {"tier_1": [], "tier_2": [], "tier_3": []}
|
| 963 |
+
|
| 964 |
+
# Calculate tier boundaries
|
| 965 |
+
n = len(filtered)
|
| 966 |
+
tier_size = n // tiers
|
| 967 |
+
|
| 968 |
+
return {
|
| 969 |
+
"tier_1": filtered[:tier_size],
|
| 970 |
+
"tier_2": filtered[tier_size : tier_size * 2],
|
| 971 |
+
"tier_3": filtered[tier_size * 2 :],
|
| 972 |
+
}
|
| 973 |
+
|
| 974 |
+
def to_geojson(self, scored_clients: List[ScoredClient]) -> Dict[str, Any]:
    """
    Convert scored clients into a GeoJSON FeatureCollection for mapping.

    Coordinates are passed through ``reduce_coordinate_precision`` so exact
    client locations are not exposed, and PII fields (name, exact address)
    are deliberately omitted from the feature properties.
    """
    feature_list: List[Dict[str, Any]] = []

    for item in scored_clients:
        profile = item.client

        # Coarsen the location before publishing it (privacy).
        lat, lng = self.reduce_coordinate_precision(
            profile.coordinates[0], profile.coordinates[1]
        )

        feature_list.append(
            {
                "type": "Feature",
                "geometry": {
                    "type": "Point",
                    "coordinates": [lng, lat],  # GeoJSON is [lng, lat]
                },
                "properties": {
                    "user_id": profile.user_id,
                    "planning_area": profile.planning_area,
                    "housing_type": profile.housing_type.value,
                    "causes": profile.causes,
                    "is_donor": profile.is_donor,
                    "final_score": round(item.final_score, 3),
                    "vector_similarity": round(item.vector_similarity_score, 3),
                    "spatial_proxy": round(item.spatial_proxy_score, 3),
                    "proximity": round(item.proximity_score, 3),
                    # Exclude PII like name, exact address
                },
            }
        )

    return {"type": "FeatureCollection", "features": feature_list}
|
| 1010 |
+
|
| 1011 |
+
|
| 1012 |
+
# ============================================================================
|
| 1013 |
+
# Mock Data Generator (for demonstration)
|
| 1014 |
+
# ============================================================================
|
| 1015 |
+
|
| 1016 |
+
# Singapore-style names (multi-ethnic: Chinese, Malay, Indian, Eurasian)
# Mock-data name pools consumed when generating synthetic client profiles.
_FIRST_NAMES_CHINESE = [
    "Wei Ling", "Jia Hui", "Xiu Mei", "Zhi Wei", "Mei Ling", "Jun Jie",
    "Xiao Ming", "Yu Yan", "Jing Yi", "Zhi Hao", "Hui Min", "Kai Wen",
    "Shi Min", "Yi Xuan", "Jia Ying", "Wen Hui", "Li Hua", "Xin Yi",
    "Jia Min", "Zhi Xuan", "Shu Ting", "Wei Jie", "Pei Shan", "Jun Wei",
]
_SURNAMES_CHINESE = [
    "Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Goh", "Chua", "Chan", "Koh",
    "Teo", "Ang", "Yeo", "Tay", "Ho", "Low", "Sim", "Chong", "Leong", "Foo",
]

_FIRST_NAMES_MALAY = [
    "Ahmad", "Muhammad", "Fatimah", "Siti", "Nur", "Aisyah", "Hafiz",
    "Amirah", "Farah", "Haziq", "Iman", "Zulkifli", "Rashid", "Nurul",
    "Hakim", "Syahira", "Irfan", "Liyana", "Danial", "Ain",
]
# Malay surnames are patronymics ("bin"/"binti" = son/daughter of).
_SURNAMES_MALAY = [
    "bin Abdullah", "binti Ismail", "bin Rahman", "binti Hassan",
    "bin Osman", "binti Ahmad", "bin Yusof", "binti Mohamed",
    "bin Ibrahim", "binti Ali", "bin Hamid", "binti Zainal",
]

_FIRST_NAMES_INDIAN = [
    "Priya", "Raj", "Ananya", "Arjun", "Kavitha", "Suresh", "Deepa",
    "Vijay", "Lakshmi", "Rahul", "Nirmala", "Sanjay", "Meena", "Arun",
    "Revathi", "Ganesh", "Shanti", "Kumar", "Devi", "Ravi",
]
_SURNAMES_INDIAN = [
    "Krishnan", "Pillai", "Nair", "Menon", "Rajan", "Sharma", "Patel",
    "Subramaniam", "Narayanan", "Chandran", "Gopal", "Muthu", "Samy",
]

_FIRST_NAMES_EURASIAN = [
    "Daniel", "Sarah", "Michael", "Rachel", "David", "Michelle", "James",
    "Vanessa", "Mark", "Stephanie", "Paul", "Amanda", "Brian", "Nicole",
]
_SURNAMES_EURASIAN = [
    "De Souza", "Pereira", "Rodrigues", "Fernandes", "Da Costa",
    "Oliveira", "Sequeira", "D'Cruz", "Shepherdson", "Westerhout",
]
|
| 1057 |
+
|
| 1058 |
+
|
| 1059 |
+
def generate_singapore_name() -> str:
    """Generate a random Singapore-style name reflecting local demographics."""
    import random

    # Draw an ethnic group with weights roughly matching Singapore's
    # resident population mix.
    group = random.choices(
        ["chinese", "malay", "indian", "eurasian"],
        weights=[0.74, 0.13, 0.09, 0.04]  # Approximate Singapore demographics
    )[0]

    if group == "chinese":
        # Chinese names are conventionally written surname-first.
        return f"{random.choice(_SURNAMES_CHINESE)} {random.choice(_FIRST_NAMES_CHINESE)}"
    if group == "malay":
        given = random.choice(_FIRST_NAMES_MALAY)
        patronymic = random.choice(_SURNAMES_MALAY)
        return f"{given} {patronymic}"
    if group == "indian":
        return f"{random.choice(_FIRST_NAMES_INDIAN)} {random.choice(_SURNAMES_INDIAN)}"
    return f"{random.choice(_FIRST_NAMES_EURASIAN)} {random.choice(_SURNAMES_EURASIAN)}"
|
| 1078 |
+
|
| 1079 |
+
|
| 1080 |
+
def generate_mock_clients(n: int = 100) -> List[ClientProfile]:
    """Generate mock client profiles for testing.

    Args:
        n: Number of profiles to generate.

    Returns:
        List of ClientProfile objects with randomized, Singapore-flavoured
        demographics; each profile has a fallback embedding and reduced
        embeddings attached.
    """
    import random

    used_names: set[str] = set()

    def get_unique_name() -> str:
        """Generate a unique Singapore name, adding suffix if needed."""
        base_name = generate_singapore_name()
        name = base_name
        suffix = 1
        while name in used_names:
            suffix += 1
            name = f"{base_name} ({suffix})"
        used_names.add(name)
        return name

    interests_pool = [
        "technology",
        "sustainability",
        "finance",
        "healthcare",
        "education",
        "arts",
        "sports",
        "community",
        "environment",
        "innovation",
        "social_impact",
        "volunteering",
        "entrepreneurship",
        "wellness",
    ]

    causes_pool = [
        "education",
        "health",
        "environment",
        "poverty",
        "children",
        "elderly",
        "disability",
        "animals",
        "arts",
        "disaster_relief",
        "human_rights",
        "technology",
        "housing",
    ]

    languages = ["en", "zh", "ms", "ta", "th", "vi"]
    age_ranges = ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
    housing_types = list(HousingType)
    planning_areas = list(PLANNING_AREAS.keys())

    # Perf fix: construct the recommender once instead of once per client
    # inside the loop — it is only needed to produce fallback embeddings.
    recommender = GISRecommender()

    clients = []

    for _ in range(n):
        # Select random planning area and add some noise to coordinates
        area_key = random.choice(planning_areas)
        area = PLANNING_AREAS[area_key]

        # Add small random offset (within ~500m)
        lat = area["lat"] + random.uniform(-0.005, 0.005)
        lng = area["lng"] + random.uniform(-0.005, 0.005)

        # Weighted housing type selection (more HDB in Singapore).
        # NOTE(review): assumes HousingType has exactly 8 members — confirm.
        housing_weights = [0.05, 0.15, 0.25, 0.2, 0.1, 0.15, 0.08, 0.02]
        housing = random.choices(housing_types, weights=housing_weights)[0]

        # Random interests and causes
        interests = random.sample(interests_pool, random.randint(2, 5))
        causes = random.sample(causes_pool, random.randint(1, 4))

        # Donor status (30% are donors)
        is_donor = random.random() < 0.3

        client = ClientProfile(
            # Bug fix: use get_unique_name() — the helper was previously
            # defined but never called, so duplicate user_ids could occur.
            user_id=get_unique_name(),
            coordinates=(round(lat, 4), round(lng, 4)),
            planning_area=area_key,
            housing_type=housing,
            interests=interests,
            causes=causes,
            preferred_language=random.choice(languages),
            is_donor=is_donor,
            total_donated=random.uniform(50, 5000) if is_donor else 0,
            donation_count=random.randint(1, 20) if is_donor else 0,
            age_range=random.choice(age_ranges),
        )

        # Generate fallback embedding and compute reduced versions
        client.embedding = recommender._generate_fallback_embedding(client)
        client.compute_reduced_embeddings()

        clients.append(client)

    return clients
|
| 1179 |
+
|
| 1180 |
+
|
| 1181 |
+
def generate_seed_donor_profile(cause: str = "education") -> ClientProfile:
    """Generate an ideal donor profile for lookalike search.

    Args:
        cause: Primary cause the seed donor supports; "children" is always
            appended as a secondary cause.

    Returns:
        A ClientProfile representing a high-value donor archetype (condo
        resident in the Orchard area with 12 prior donations), with a
        fallback embedding and reduced embeddings precomputed.
    """
    profile = ClientProfile(
        user_id="seed_donor",
        coordinates=(1.3048, 103.8318),  # Orchard area
        planning_area="orchard",
        housing_type=HousingType.CONDO,
        interests=["sustainability", "social_impact", "community"],
        causes=[cause, "children"],
        preferred_language="en",
        is_donor=True,
        total_donated=2500.0,
        donation_count=12,
        age_range="35-44",
    )

    # Generate fallback embedding and compute reduced versions
    recommender = GISRecommender()
    profile.embedding = recommender._generate_fallback_embedding(profile)
    profile.compute_reduced_embeddings()

    return profile
|
recommender/vector_store.py
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Vector storage and retrieval for donor/volunteer embeddings.
|
| 2 |
+
|
| 3 |
+
Uses the existing my_embeddings table in Supabase with pgvector extension.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
from typing import List, Optional, Dict, Any, Union
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _parse_json_field(value: Union[str, dict, None]) -> dict:
|
| 13 |
+
"""Safely parse a JSON field that might already be a dict (psycopg3 auto-parses)."""
|
| 14 |
+
if value is None:
|
| 15 |
+
return {}
|
| 16 |
+
if isinstance(value, dict):
|
| 17 |
+
return value
|
| 18 |
+
if isinstance(value, str):
|
| 19 |
+
try:
|
| 20 |
+
return json.loads(value)
|
| 21 |
+
except json.JSONDecodeError:
|
| 22 |
+
return {}
|
| 23 |
+
return {}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class SimilarityResult:
    """A single match returned by a vector similarity query.

    Attributes:
        id: The source_id of the matched form.
        form_data: The original form data as a dictionary.
        score: Similarity score (higher is more similar).
        form_type: Type of form ("donor" or "volunteer").
        distance: Raw L2 distance from the query embedding.
    """

    id: str
    form_data: Dict[str, Any]
    score: float
    form_type: str
    distance: float = 0.0
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class DonorVectorStore:
    """Vector storage and retrieval for donor/volunteer embeddings.

    Uses the existing my_embeddings table schema:
    - source_id: form ID
    - chunk_index: always 0 (single embedding per form)
    - text_content: JSON serialized form data
    - metadata: {"form_type": "donor"|"volunteer", ...}
    - embedding: VECTOR(1024)

    All queries are parameterized; similarity search uses pgvector's
    ``<->`` (L2 distance) operator.

    Attributes:
        pool: AsyncConnectionPool for database connections.
    """

    def __init__(self, pool):
        """Initialize vector store.

        Args:
            pool: AsyncConnectionPool from psycopg_pool
        """
        self.pool = pool

    async def store_embedding(
        self,
        form_id: str,
        form_type: str,
        embedding: np.ndarray,
        form_data: Dict[str, Any]
    ) -> int:
        """Store form embedding in my_embeddings table.

        Args:
            form_id: Unique identifier for the form.
            form_type: Type of form ("donor" or "volunteer").
            embedding: The 1024-dimensional embedding vector.
            form_data: Original form data to store.

        Returns:
            The database ID of the inserted record.
        """
        # pgvector accepts a Python list cast to ::vector in SQL.
        embedding_list = embedding.tolist()
        # default=str makes non-JSON-native values (dates etc.) serializable.
        form_json = json.dumps(form_data, default=str)

        async with self.pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute(
                    """
                    INSERT INTO my_embeddings
                    (source_id, chunk_index, text_content, metadata, embedding)
                    VALUES (%s, %s, %s, %s, %s::vector)
                    RETURNING id
                    """,
                    (
                        form_id,
                        0,  # Single embedding per form
                        form_json,
                        json.dumps({"form_type": form_type}),
                        embedding_list
                    )
                )
                result = await cur.fetchone()
                return result[0]

    async def update_embedding(
        self,
        form_id: str,
        embedding: np.ndarray,
        form_data: Optional[Dict[str, Any]] = None
    ) -> bool:
        """Update an existing embedding.

        Args:
            form_id: The form ID to update.
            embedding: New embedding vector.
            form_data: Optional updated form data.

        Returns:
            True if update succeeded, False if record not found.
        """
        embedding_list = embedding.tolist()

        async with self.pool.connection() as conn:
            async with conn.cursor() as cur:
                if form_data:
                    # Update both the embedding and the serialized form.
                    form_json = json.dumps(form_data, default=str)
                    await cur.execute(
                        """
                        UPDATE my_embeddings
                        SET embedding = %s::vector, text_content = %s
                        WHERE source_id = %s
                        """,
                        (embedding_list, form_json, form_id)
                    )
                else:
                    # Embedding-only update; keep existing text_content.
                    await cur.execute(
                        """
                        UPDATE my_embeddings
                        SET embedding = %s::vector
                        WHERE source_id = %s
                        """,
                        (embedding_list, form_id)
                    )
                # rowcount == 0 means no record matched the source_id.
                return cur.rowcount > 0

    async def delete_embedding(self, form_id: str) -> bool:
        """Delete an embedding by form ID.

        Args:
            form_id: The form ID to delete.

        Returns:
            True if deletion succeeded, False if record not found.
        """
        async with self.pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute(
                    "DELETE FROM my_embeddings WHERE source_id = %s",
                    (form_id,)
                )
                return cur.rowcount > 0

    async def get_embedding(self, form_id: str) -> Optional[SimilarityResult]:
        """Get a specific embedding by form ID.

        Args:
            form_id: The form ID to retrieve.

        Returns:
            SimilarityResult if found, None otherwise. score/distance are
            fixed placeholders (1.0 / 0.0) since no query vector is involved.
        """
        async with self.pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute(
                    """
                    SELECT source_id, text_content, metadata
                    FROM my_embeddings
                    WHERE source_id = %s
                    """,
                    (form_id,)
                )
                row = await cur.fetchone()

        if not row:
            return None

        form_data = _parse_json_field(row[1])
        metadata = _parse_json_field(row[2])

        return SimilarityResult(
            id=row[0],
            form_data=form_data,
            form_type=metadata.get("form_type", "unknown"),
            score=1.0,
            distance=0.0,
        )

    async def find_similar(
        self,
        query_embedding: np.ndarray,
        form_type: Optional[str] = None,
        limit: int = 10,
        country_filter: Optional[str] = None,
        exclude_ids: Optional[List[str]] = None
    ) -> List[SimilarityResult]:
        """Find similar donors/volunteers using vector similarity.

        Uses L2 distance (Euclidean) with IVFFlat index for efficient search.

        Args:
            query_embedding: The query embedding vector.
            form_type: Optional filter for "donor" or "volunteer".
            limit: Maximum number of results to return.
            country_filter: Optional filter for country code.
            exclude_ids: Optional list of form IDs to exclude.

        Returns:
            List of SimilarityResult ordered by similarity (highest first).
        """
        embedding_list = query_embedding.tolist()

        # Build query with optional filters
        query = """
            SELECT
                source_id,
                text_content,
                metadata,
                embedding <-> %s::vector AS distance
            FROM my_embeddings
            WHERE 1=1
        """
        params: List[Any] = [embedding_list]

        if form_type:
            query += " AND metadata->>'form_type' = %s"
            params.append(form_type)

        if country_filter:
            # NOTE(review): matches the raw serialized JSON text, so it
            # assumes text_content renders country as '"country": "XX"'
            # with exactly this spacing — confirm against store_embedding's
            # json.dumps output format.
            query += " AND text_content ILIKE %s"
            params.append(f'%"country": "{country_filter}"%')

        if exclude_ids:
            # Parameterized NOT IN — only the placeholder count is
            # interpolated into the SQL, never the values themselves.
            placeholders = ", ".join(["%s"] * len(exclude_ids))
            query += f" AND source_id NOT IN ({placeholders})"
            params.extend(exclude_ids)

        query += " ORDER BY distance ASC LIMIT %s"
        params.append(limit)

        async with self.pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute(query, params)
                rows = await cur.fetchall()

        results = []
        for row in rows:
            form_data = _parse_json_field(row[1])
            metadata = _parse_json_field(row[2])
            distance = float(row[3])

            results.append(SimilarityResult(
                id=row[0],
                form_data=form_data,
                form_type=metadata.get("form_type", "unknown"),
                score=1.0 / (1.0 + distance),  # Convert distance to similarity
                distance=distance
            ))

        return results

    async def find_by_causes(
        self,
        target_causes: List[str],
        query_embedding: np.ndarray,
        limit: int = 20
    ) -> List[SimilarityResult]:
        """Hybrid search: filter by causes, rank by embedding similarity.

        Combines keyword filtering with vector similarity for better
        recommendations when specific causes are targeted.

        Args:
            target_causes: List of cause categories to match.
            query_embedding: The query embedding for ranking.
            limit: Maximum number of results to return.

        Returns:
            List of SimilarityResult matching causes, ranked by similarity.
        """
        embedding_list = query_embedding.tolist()

        # Build ILIKE clauses for cause filtering.
        # NOTE(review): causes are substring-matched against the serialized
        # JSON, so a cause containing '%' or '_' would act as a wildcard.
        cause_conditions = " OR ".join([
            "text_content ILIKE %s" for _ in target_causes
        ])
        cause_params = [f"%{cause}%" for cause in target_causes]

        query = f"""
            SELECT
                source_id,
                text_content,
                metadata,
                embedding <-> %s::vector AS distance
            FROM my_embeddings
            WHERE ({cause_conditions})
            ORDER BY distance ASC
            LIMIT %s
        """

        params = [embedding_list] + cause_params + [limit]

        async with self.pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute(query, params)
                rows = await cur.fetchall()

        results = []
        for row in rows:
            form_data = _parse_json_field(row[1])
            metadata = _parse_json_field(row[2])
            distance = float(row[3])

            results.append(SimilarityResult(
                id=row[0],
                form_data=form_data,
                form_type=metadata.get("form_type", "unknown"),
                score=1.0 / (1.0 + distance),
                distance=distance
            ))

        return results

    async def count_by_type(self) -> Dict[str, int]:
        """Get count of embeddings by form type.

        Returns:
            Dictionary with counts: {"donor": N, "volunteer": M, "total": N+M}
        """
        async with self.pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute("""
                    SELECT
                        metadata->>'form_type' as form_type,
                        COUNT(*) as count
                    FROM my_embeddings
                    GROUP BY metadata->>'form_type'
                """)
                rows = await cur.fetchall()

        counts = {"donor": 0, "volunteer": 0, "total": 0}
        for row in rows:
            form_type = row[0] or "unknown"
            count = row[1]
            # Unknown form types still contribute to "total" but get no
            # dedicated key.
            if form_type in counts:
                counts[form_type] = count
            counts["total"] += count

        return counts

    async def find_by_form_type(
        self, form_type: str, limit: int = 500
    ) -> List[SimilarityResult]:
        """Get all entries of a specific form type.

        Args:
            form_type: Type of form ("donor", "volunteer", or "client").
            limit: Maximum number of results to return.

        Returns:
            List of SimilarityResult for the specified form type. score and
            distance are placeholders (1.0 / 0.0) — no ranking is performed.
        """
        query = """
            SELECT
                source_id,
                text_content,
                metadata
            FROM my_embeddings
            WHERE metadata->>'form_type' = %s
            LIMIT %s
        """

        async with self.pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute(query, (form_type, limit))
                rows = await cur.fetchall()

        results = []
        for row in rows:
            form_data = _parse_json_field(row[1])
            metadata = _parse_json_field(row[2])

            results.append(
                SimilarityResult(
                    id=row[0],
                    form_data=form_data,
                    form_type=metadata.get("form_type", form_type),
                    score=1.0,
                    distance=0.0,
                )
            )

        return results
|
requirements.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.109.0
|
| 2 |
+
uvicorn>=0.27.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
python-dotenv
|
| 5 |
+
|
| 6 |
+
# LangGraph and LangChain
|
| 7 |
+
langchain
|
| 8 |
+
langchain-core
|
| 9 |
+
langchain-ollama
|
| 10 |
+
langgraph
|
| 11 |
+
langgraph-checkpoint-postgres
|
| 12 |
+
|
| 13 |
+
# Database
|
| 14 |
+
psycopg[binary,pool]>=3.1.0
|
| 15 |
+
|
| 16 |
+
# SeaLion encoder
|
| 17 |
+
httpx>=0.24.0
|
| 18 |
+
numpy>=1.24.0
|
| 19 |
+
|
| 20 |
+
# OpenAI (for charity web search tools)
|
| 21 |
+
openai>=1.0.0
|
test_agentic_rag.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Test script for Agentic RAG functionality."""
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
# Add project root to path
|
| 8 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
try:
|
| 12 |
+
from dotenv import load_dotenv
|
| 13 |
+
load_dotenv()
|
| 14 |
+
except ImportError:
|
| 15 |
+
pass
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
async def test_rag_tools():
    """Exercise the individual RAG tools end-to-end against live services.

    Prints progress to stdout. Skips silently (after a warning) when
    SEALION_ENDPOINT or SUPABASE_DB_HOST env vars are not configured, so
    it is safe to run without credentials. Any other failure is caught
    and printed with a traceback rather than raised.
    """
    print("\n" + "=" * 60)
    print("Testing Agentic RAG Tools")
    print("=" * 60)

    from tools.rag_tools import (
        RAG_TOOLS,
        set_rag_dependencies,
        list_available_categories,
        get_statistics,
        semantic_search,
    )

    # Check available tools
    print("\n📦 Available RAG Tools:")
    for tool in RAG_TOOLS:
        print(f" - {tool.name}: {tool.description[:60]}...")

    # Initialize dependencies
    print("\n🔧 Initializing dependencies...")

    try:
        from encoders.sealion import SeaLionEncoder
        from recommender.vector_store import DonorVectorStore
        from psycopg_pool import AsyncConnectionPool

        # Check for required env vars
        sealion_endpoint = os.getenv("SEALION_ENDPOINT")
        db_host = os.getenv("SUPABASE_DB_HOST")

        if not sealion_endpoint:
            print(" ⚠️ SEALION_ENDPOINT not set, skipping live tests")
            return

        if not db_host:
            print(" ⚠️ Database credentials not set, skipping live tests")
            return

        # Initialize encoder
        encoder = SeaLionEncoder(endpoint_url=sealion_endpoint)
        print(f" ✅ SeaLion encoder initialized (dim: {encoder.embedding_dimension})")

        # Initialize database pool (Supabase pooled port 6543 by default)
        db_port = os.getenv("SUPABASE_DB_PORT", "6543")
        db_name = os.getenv("SUPABASE_DB_NAME", "postgres")
        db_user = os.getenv("SUPABASE_DB_USER")
        db_password = os.getenv("SUPABASE_DB_PASSWORD")
        db_sslmode = os.getenv("SUPABASE_DB_SSLMODE", "require")

        conn_string = (
            f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
            f"?sslmode={db_sslmode}"
        )

        # prepare_threshold=None: required when going through a transaction
        # pooler (pgbouncer), which does not support prepared statements.
        pool = AsyncConnectionPool(
            conninfo=conn_string,
            max_size=5,
            kwargs={"autocommit": True, "prepare_threshold": None},
        )
        await pool.open()
        print(" ✅ Database pool connected")

        vector_store = DonorVectorStore(pool)
        print(" ✅ Vector store initialized")

        # Set dependencies for tools
        set_rag_dependencies(encoder, vector_store)
        print(" ✅ RAG tools configured")

        # Test list_available_categories
        print("\n📊 Testing list_available_categories...")
        categories_result = await list_available_categories.ainvoke({})
        print(f" Result: {categories_result[:200]}...")

        # Test get_statistics
        print("\n📈 Testing get_statistics...")
        stats_result = await get_statistics.ainvoke({})
        print(f" Result: {stats_result}")

        # Test semantic_search (if there's data)
        print("\n🔍 Testing semantic_search...")
        search_result = await semantic_search.ainvoke({
            "query": "education donors in Singapore",
            "limit": 3
        })
        print(f" Result: {search_result[:300]}...")

        # Cleanup
        await pool.close()
        print("\n✅ All tool tests completed!")

    except Exception as e:
        import traceback
        print(f" ❌ Error: {e}")
        traceback.print_exc()
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
async def test_agentic_rag_agent():
    """Test the full Agentic RAG agent.

    End-to-end smoke test: wires the SEA-LION encoder, a Supabase-backed
    vector store, and an Ollama LLM into an AgenticRAGAgent, then runs a
    single search query. Skips cleanly when required env vars are missing.
    """
    print("\n" + "=" * 60)
    print("Testing Agentic RAG Agent")
    print("=" * 60)

    try:
        # Imports deferred so missing optional deps surface as a reported
        # error instead of breaking module import.
        from agents.agentic_rag import AgenticRAGAgent
        from encoders.sealion import SeaLionEncoder
        from recommender.vector_store import DonorVectorStore
        from psycopg_pool import AsyncConnectionPool
        from langchain_ollama import ChatOllama

        # Check for required env vars
        sealion_endpoint = os.getenv("SEALION_ENDPOINT")
        db_host = os.getenv("SUPABASE_DB_HOST")
        ollama_api_key = os.getenv("OLLAMA_API_KEY")

        # OLLAMA_API_KEY is optional; only the encoder endpoint and DB host
        # are hard requirements for this test.
        if not all([sealion_endpoint, db_host]):
            print(" ⚠️ Required environment variables not set, skipping agent test")
            return

        print("\n🔧 Initializing agent components...")

        # Initialize encoder
        encoder = SeaLionEncoder(endpoint_url=sealion_endpoint)

        # Initialize database (6543 is Supabase's pooled connection port)
        db_port = os.getenv("SUPABASE_DB_PORT", "6543")
        db_name = os.getenv("SUPABASE_DB_NAME", "postgres")
        db_user = os.getenv("SUPABASE_DB_USER")
        db_password = os.getenv("SUPABASE_DB_PASSWORD")
        db_sslmode = os.getenv("SUPABASE_DB_SSLMODE", "require")

        conn_string = (
            f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
            f"?sslmode={db_sslmode}"
        )

        # prepare_threshold=None disables server-side prepared statements —
        # presumably for compatibility with the pooled (pgbouncer-style)
        # port above; TODO confirm against the Supabase setup.
        pool = AsyncConnectionPool(
            conninfo=conn_string,
            max_size=5,
            kwargs={"autocommit": True, "prepare_threshold": None},
        )
        await pool.open()

        vector_store = DonorVectorStore(pool)

        # Initialize LLM: authenticated Ollama cloud endpoint when a key is
        # configured, otherwise the unauthenticated cloud model alias.
        if ollama_api_key:
            llm = ChatOllama(
                model="gpt-oss:120b",
                base_url="https://ollama.com",
                client_kwargs={
                    "headers": {"Authorization": f"Bearer {ollama_api_key}"}
                }
            )
        else:
            llm = ChatOllama(model="gpt-oss:120b-cloud")

        print(" ✅ All components initialized")

        # Create agent
        agent = AgenticRAGAgent(llm, encoder, vector_store)
        print(" ✅ Agentic RAG agent created")

        # Test a query
        print("\n🤖 Running agent query: 'Find donors interested in education'")
        print("-" * 40)

        result = await agent.search("Find donors interested in education in Singapore")

        # result is expected to carry 'response', 'tool_calls', and
        # 'message_count' keys (see the prints below) — defined by
        # AgenticRAGAgent.search, outside this file.
        print(f"\n📝 Response:\n{result['response'][:500]}...")
        print(f"\n🔧 Tool calls made: {len(result['tool_calls'])}")
        for tc in result['tool_calls']:
            print(f" - {tc['tool']}: {tc['args']}")
        print(f"\n📊 Total messages: {result['message_count']}")

        # Cleanup
        await pool.close()
        print("\n✅ Agent test completed!")

    except Exception as e:
        # Broad catch is deliberate for a smoke test: report and continue
        # so the remaining tests in the suite still run.
        import traceback
        print(f" ❌ Error: {e}")
        traceback.print_exc()
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
async def main():
    """Entry point: run the RAG tool tests, then the full agent test."""
    banner = "=" * 60
    print("\n🚀 Agentic RAG Test Suite")
    print(banner)

    # Run each suite in order; each one handles its own errors internally.
    for suite in (test_rag_tools, test_agentic_rag_agent):
        await suite()

    print("\n" + banner)
    print("All tests completed!")
    print(banner)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
if __name__ == "__main__":
    # Windows async fix: force the selector-based event loop policy —
    # presumably because a dependency used here does not support the
    # default Proactor loop on Windows; TODO confirm which one requires it.
    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    asyncio.run(main())
|
test_api.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Test the lookalike API endpoint."""
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
from app import find_lookalike_clients, LookalikeRequest
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
async def test_lookalike_endpoint():
    """Test the lookalike endpoint with hybrid matching.

    Calls the in-process FastAPI handler directly across four scenarios:
      1. Basic request with GeoJSON output
      2. Planning-area filter
      3. Housing-type filter
      4. Relaxed min_score (0.0) to surface all matches
    """
    print("=" * 60)
    print("Testing Lookalike API Endpoint")
    print("=" * 60)

    # Test 1: Basic request
    print("\nTest 1: Basic request with education cause")
    req = LookalikeRequest(
        seed_causes=["education"],
        seed_interests=["sustainability"],
        limit=15,
        include_geojson=True,
    )

    result = await find_lookalike_clients(req)

    print(f"Total found: {result.total_found}")
    print(
        f"Tiers: T1={len(result.tiers['tier_1'])}, T2={len(result.tiers['tier_2'])}, T3={len(result.tiers['tier_3'])}"
    )
    print(
        f"GeoJSON features: {len(result.geojson['features']) if result.geojson else 0}"
    )

    if result.tiers["tier_1"]:
        top = result.tiers["tier_1"][0]
        print(f"\nTop match: {top.user_id}")
        print(f" Score: {top.final_score:.3f}")
        print(f" Causes: {top.causes}")
        print(f" Area: {top.planning_area}")

    # Test 2: With planning area filter
    print("\n" + "-" * 60)
    print("Test 2: With planning area filter (bishan)")
    req2 = LookalikeRequest(
        seed_causes=["education", "children"],
        seed_interests=["community"],
        planning_area_filter="bishan",
        limit=10,
        include_geojson=False,
    )

    result2 = await find_lookalike_clients(req2)
    print(f"Total found in Bishan: {result2.total_found}")

    # Test 3: With housing type filter
    print("\n" + "-" * 60)
    print("Test 3: With housing type filter (condo, landed)")
    req3 = LookalikeRequest(
        seed_causes=["environment"],
        seed_interests=["technology"],
        housing_type_filter=["condo", "landed"],
        limit=10,
        include_geojson=False,
    )

    result3 = await find_lookalike_clients(req3)
    print(f"Total found (high-income housing): {result3.total_found}")

    for client in result3.tiers["tier_1"][:3]:
        print(
            f" - {client.user_id}: {client.housing_type}, score={client.final_score:.3f}"
        )

    # Test 4: Low minimum score to get all matches
    print("\n" + "-" * 60)
    print("Test 4: Relaxed min_score (0.0)")
    req4 = LookalikeRequest(
        seed_causes=["health"],
        seed_interests=[],
        min_score=0.0,
        limit=30,
        include_geojson=True,
    )

    result4 = await find_lookalike_clients(req4)
    print(f"Total found: {result4.total_found}")
    # Fix: min()/max() over an empty sequence raises ValueError, which
    # crashed this test whenever no matches came back. Collect the scores
    # once (the original also iterated the tiers twice) and guard empty.
    scores = [c.final_score for tier in result4.tiers.values() for c in tier]
    if scores:
        print(f"Score range: {min(scores):.3f} - {max(scores):.3f}")
    else:
        print("Score range: no matches returned")

    print("\n" + "=" * 60)
    print("All API tests passed!")
    print("=" * 60)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
if __name__ == "__main__":
    # Run the endpoint smoke tests directly: python test_api.py
    asyncio.run(test_lookalike_endpoint())
|
test_gis.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Test the GIS recommender with dimensionality reduction."""
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
from recommender.gis_recommender import (
|
| 5 |
+
EmbeddingReducer,
|
| 6 |
+
HybridSemanticSpatialEncoder,
|
| 7 |
+
generate_mock_clients,
|
| 8 |
+
generate_seed_donor_profile,
|
| 9 |
+
GISRecommender,
|
| 10 |
+
ClientProfile,
|
| 11 |
+
HousingType,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_embedding_reducer():
    """Smoke-test EmbeddingReducer.compute_sparse_projection on a sparse vector."""
    divider = "=" * 50
    print(divider)
    print("Testing EmbeddingReducer")
    print(divider)

    # Build a sparse 1024-dim embedding (mimicking SEA-LION output):
    # a handful of non-zero components scattered across the vector.
    sparse_components = {
        0: 0.206,
        1: -0.198,
        10: 0.178,
        50: -0.145,
        100: 0.234,
        200: -0.167,
        500: 0.189,
        800: -0.156,
    }
    sample_embedding = np.zeros(1024)
    for index, component in sparse_components.items():
        sample_embedding[index] = component

    # Project down to 8 components and report what came out.
    reduced = EmbeddingReducer.compute_sparse_projection(
        sample_embedding, n_components=8
    )
    print(f"Original dims: {len(sample_embedding)}")
    print(f"Reduced dims: {len(reduced)}")
    print(f"Reduced values: {reduced}")
    print(f"Reduced norm: {np.linalg.norm(reduced):.4f}")
    print()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_hybrid_encoder():
    """Exercise HybridSemanticSpatialEncoder: encode vectors and compare similarities."""
    divider = "=" * 50
    print(divider)
    print("Testing HybridSemanticSpatialEncoder")
    print(divider)

    encoder = HybridSemanticSpatialEncoder(semantic_dims=8)

    # Encode a random embedding at Singapore's center point.
    base_embedding = np.random.randn(1024)
    sg_center = (1.3521, 103.8198)  # Singapore center

    fused = encoder.encode(base_embedding, sg_center)
    print(f"Hybrid vector dims: {len(fused)}")  # Should be 8 + 2 = 10
    print(f"Hybrid values: {fused}")

    # Same embedding, nearly identical coordinates -> expect high similarity.
    nearby_coords = (1.3525, 103.8195)  # Very close
    fused_nearby = encoder.encode(base_embedding, nearby_coords)

    sim_same = encoder.compute_similarity(fused, fused_nearby)
    print(f"Similarity (same embedding, close coords): {sim_same:.4f}")

    # Different random embedding at the original location.
    other_embedding = np.random.randn(1024)
    fused_other = encoder.encode(other_embedding, sg_center)

    sim_diff = encoder.compute_similarity(fused, fused_other)
    print(f"Similarity (diff embedding, same coords): {sim_diff:.4f}")
    print()
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def test_mock_clients():
    """Generate a seed donor profile plus mock clients and print embedding shapes."""
    divider = "=" * 50
    print(divider)
    print("Testing Mock Client Generation")
    print(divider)

    # Seed profile carries full, reduced, and hybrid embeddings.
    seed = generate_seed_donor_profile("education")
    print(f"Seed profile: {seed.user_id}")
    print(f" - Causes: {seed.causes}")
    print(f" - Full embedding dims: {len(seed.embedding)}")
    print(f" - Reduced embedding dims: {len(seed.embedding_reduced)}")
    print(f" - Hybrid embedding dims: {len(seed.hybrid_embedding)}")
    print()

    # Spot-check the first few generated clients.
    mock_pool = generate_mock_clients(10)
    print(f"Generated {len(mock_pool)} mock clients")
    for idx, client in enumerate(mock_pool[:3]):
        print(f" Client {idx}: {client.user_id}")
        print(f" - Area: {client.planning_area}, Housing: {client.housing_type.value}")
        print(f" - Causes: {client.causes}")
        print(
            f" - Has embeddings: full={client.embedding is not None}, reduced={client.embedding_reduced is not None}"
        )
    print()
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def test_hybrid_lookalike():
    """Test hybrid lookalike matching.

    Runs GISRecommender.find_lookalikes_hybrid twice: once unfiltered and
    once restricted to the 'bishan' planning area, printing the top scores.
    """
    print("=" * 50)
    print("Testing Hybrid Lookalike Matching")
    print("=" * 50)

    seed = generate_seed_donor_profile("education")
    candidates = generate_mock_clients(50)

    recommender = GISRecommender()

    # Find lookalikes without filters
    results = recommender.find_lookalikes_hybrid(
        seed_profile=seed,
        candidates=candidates,
        k=10,
    )

    print(f"Found {len(results)} lookalikes")
    print("\nTop 5 matches:")
    # Each result exposes a composite final_score plus its three components
    # (vector similarity, spatial proxy, proximity) and the geo distance.
    for i, r in enumerate(results[:5]):
        print(f" {i+1}. {r.client.user_id}")
        print(
            f" Score: {r.final_score:.3f} (vector={r.vector_similarity_score:.3f}, spatial={r.spatial_proxy_score:.3f}, prox={r.proximity_score:.3f})"
        )
        print(f" Causes: {r.client.causes}")
        print(f" Distance: {r.geo_distance_km:.2f} km")
    print()

    # Test with planning area filter
    print("\nWith planning area filter (bishan):")
    results_filtered = recommender.find_lookalikes_hybrid(
        seed_profile=seed,
        candidates=candidates,
        k=10,
        planning_area_filter="bishan",
    )
    print(f"Found {len(results_filtered)} matches in Bishan")
    for r in results_filtered[:3]:
        print(f" - {r.client.user_id}: {r.final_score:.3f}")
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def test_tiered_targeting():
    """Bucket hybrid lookalike results into priority tiers and report sizes."""
    divider = "=" * 50
    print(divider)
    print("Testing Tiered Targeting")
    print(divider)

    seed = generate_seed_donor_profile("education")
    candidate_pool = generate_mock_clients(100)

    recommender = GISRecommender()

    matches = recommender.find_lookalikes_hybrid(
        seed_profile=seed,
        candidates=candidate_pool,
        k=30,
    )

    buckets = recommender.apply_tiered_targeting(matches, min_score=0.0)

    # Report the size of each tier, highest priority first.
    tier_labels = {
        "tier_1": "Tier 1 (High Priority)",
        "tier_2": "Tier 2 (Medium Priority)",
        "tier_3": "Tier 3 (Lower Priority)",
    }
    for tier_key in ("tier_1", "tier_2", "tier_3"):
        print(f"{tier_labels[tier_key]}: {len(buckets[tier_key])} clients")

    # [-1]/[0] ordering below mirrors the original output (low - high);
    # presumably the tiers are sorted by score — confirm in GISRecommender.
    if buckets["tier_1"]:
        print(
            f"\nTier 1 score range: {buckets['tier_1'][-1].final_score:.3f} - {buckets['tier_1'][0].final_score:.3f}"
        )
    if buckets["tier_3"]:
        print(
            f"Tier 3 score range: {buckets['tier_3'][-1].final_score:.3f} - {buckets['tier_3'][0].final_score:.3f}"
        )
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def test_geojson_export():
    """Export lookalike results to GeoJSON and print a feature summary."""
    divider = "=" * 50
    print(divider)
    print("Testing GeoJSON Export")
    print(divider)

    seed = generate_seed_donor_profile("education")
    candidate_pool = generate_mock_clients(20)

    recommender = GISRecommender()
    matches = recommender.find_lookalikes_hybrid(seed, candidate_pool, k=10)

    collection = recommender.to_geojson(matches)

    print(f"GeoJSON type: {collection['type']}")
    print(f"Number of features: {len(collection['features'])}")

    # Inspect the first feature's geometry and property keys, if any exist.
    features = collection["features"]
    if features:
        sample = features[0]
        print("\nSample feature:")
        print(
            f" Geometry: {sample['geometry']['type']} at {sample['geometry']['coordinates']}"
        )
        print(f" Properties: {list(sample['properties'].keys())}")
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
if __name__ == "__main__":
    # Run every GIS check in sequence; any failure raises and halts the run.
    checks = (
        test_embedding_reducer,
        test_hybrid_encoder,
        test_mock_clients,
        test_hybrid_lookalike,
        test_tiered_targeting,
        test_geojson_export,
    )
    for check in checks:
        check()

    print("\n" + "=" * 50)
    print("All tests passed!")
    print("=" * 50)
|
tools/__init__.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tools for LangGraph agents."""
|
| 2 |
+
|
| 3 |
+
from .web_search import (
|
| 4 |
+
search_charity_info,
|
| 5 |
+
search_charity_ratings,
|
| 6 |
+
search_charity_comprehensive,
|
| 7 |
+
CHARITY_SEARCH_TOOLS,
|
| 8 |
+
openai_web_search,
|
| 9 |
+
clear_search_cache,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
from .rag_tools import (
|
| 13 |
+
semantic_search,
|
| 14 |
+
filter_by_metadata,
|
| 15 |
+
get_document_by_id,
|
| 16 |
+
list_available_categories,
|
| 17 |
+
hybrid_search,
|
| 18 |
+
get_statistics,
|
| 19 |
+
RAG_TOOLS,
|
| 20 |
+
set_rag_dependencies,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
__all__ = [
|
| 24 |
+
# Web search tools
|
| 25 |
+
"search_charity_info",
|
| 26 |
+
"search_charity_ratings",
|
| 27 |
+
"search_charity_comprehensive",
|
| 28 |
+
"CHARITY_SEARCH_TOOLS",
|
| 29 |
+
"openai_web_search",
|
| 30 |
+
"clear_search_cache",
|
| 31 |
+
# RAG tools
|
| 32 |
+
"semantic_search",
|
| 33 |
+
"filter_by_metadata",
|
| 34 |
+
"get_document_by_id",
|
| 35 |
+
"list_available_categories",
|
| 36 |
+
"hybrid_search",
|
| 37 |
+
"get_statistics",
|
| 38 |
+
"RAG_TOOLS",
|
| 39 |
+
"set_rag_dependencies",
|
| 40 |
+
]
|
tools/rag_tools.py
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agentic RAG tools for autonomous vector store exploration.
|
| 2 |
+
|
| 3 |
+
This module provides tools that allow an agent to autonomously:
|
| 4 |
+
1. Search semantically across the vector store
|
| 5 |
+
2. Filter by metadata fields
|
| 6 |
+
3. Retrieve specific documents
|
| 7 |
+
4. List available categories
|
| 8 |
+
5. Perform hybrid search with filters
|
| 9 |
+
|
| 10 |
+
The agent uses a ReAct loop to iteratively explore and refine its search.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
from typing import Optional, Dict, Any, List
|
| 15 |
+
from langchain_core.tools import tool
|
| 16 |
+
from functools import wraps
|
| 17 |
+
|
| 18 |
+
# Global references to be set at initialization
|
| 19 |
+
_encoder = None
|
| 20 |
+
_vector_store = None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def set_rag_dependencies(encoder, vector_store):
    """Wire the shared encoder and vector store into this module's RAG tools.

    Must be called once at startup before any tool is invoked; the tools
    read these module-level globals on every call.

    Args:
        encoder: SeaLion encoder used to embed search queries.
        vector_store: DonorVectorStore backing the retrieval tools.
    """
    global _encoder, _vector_store
    _encoder, _vector_store = encoder, vector_store
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _format_results(results: List[Any], include_details: bool = True) -> str:
    """Render similarity results as pretty-printed JSON for agent consumption.

    Args:
        results: SimilarityResult objects to serialize.
        include_details: When True, fold key form fields into each entry.

    Returns:
        A JSON array string, or a fixed message when nothing matched.
    """
    if not results:
        return "No results found."

    def build_entry(rank, hit):
        # Core fields every result carries, regardless of form type.
        entry = {
            "rank": rank,
            "id": hit.id,
            "form_type": hit.form_type,
            "similarity_score": round(hit.score, 4),
        }
        data = hit.form_data
        if include_details and data:
            # Common readable fields first, then type-specific ones.
            entry["name"] = data.get("name", "Unknown")
            entry["country"] = data.get("country", "Unknown")
            entry["causes"] = data.get("causes", [])
            if hit.form_type == "donor":
                entry["donor_type"] = data.get("donor_type", "Unknown")
                entry["donation_frequency"] = data.get("donation_frequency")
            elif hit.form_type == "volunteer":
                entry["volunteer_type"] = data.get("volunteer_type", "Unknown")
                entry["skills"] = data.get("skills", [])
                entry["availability"] = data.get("availability")
        return entry

    rendered = [build_entry(rank, hit) for rank, hit in enumerate(results, 1)]
    # default=str keeps non-JSON-native values serializable.
    return json.dumps(rendered, indent=2, default=str)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@tool
async def semantic_search(query: str, limit: int = 5, form_type: Optional[str] = None) -> str:
    """Search documents by semantic similarity.

    Use this to find donors/volunteers whose profiles match a natural language query.
    The search uses vector embeddings to find semantically similar entries.

    Args:
        query: Natural language description of what you're looking for.
               Examples: "corporate donors interested in education",
               "volunteers with tech skills in Singapore"
        limit: Maximum number of results to return (default: 5, max: 20)
        form_type: Optional filter - "donor" or "volunteer"

    Returns:
        JSON formatted list of matching profiles with similarity scores
    """
    # NOTE: the docstring above doubles as the tool description shown to the
    # LLM by LangChain — do not reword it without reviewing agent behavior.
    print(f"[Agentic RAG] semantic_search called - query: '{query}', limit: {limit}, form_type: {form_type}")
    # Dependencies are injected once at startup via set_rag_dependencies().
    if _encoder is None or _vector_store is None:
        return "Error: RAG tools not initialized. Call set_rag_dependencies first."

    try:
        # Encode the query into the same embedding space as stored profiles.
        embedding = await _encoder.encode(query)

        # Search the vector store; clamp limit to the documented max of 20.
        results = await _vector_store.find_similar(
            query_embedding=embedding,
            form_type=form_type,
            limit=min(limit, 20)
        )

        return _format_results(results)
    except Exception as e:
        # Tools return error strings rather than raising, so the agent
        # can observe the failure and adjust its next action.
        return f"Search error: {str(e)}"
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@tool
async def filter_by_metadata(
    field: str,
    value: str,
    limit: int = 10
) -> str:
    """Browse documents filtered by metadata field.

    Use this to find all entries matching a specific metadata value.
    Useful for exploring what's available before doing semantic search.

    Args:
        field: The metadata field to filter on.
               Valid fields: "form_type", "donor_type", "volunteer_type",
               "country", "availability"
        value: The value to match.
               Examples: form_type="donor", country="SG", donor_type="corporate"
        limit: Maximum number of results (default: 10)

    Returns:
        JSON formatted list of matching entries
    """
    # NOTE: the docstring above is also the tool description seen by the LLM.
    print(f"[Agentic RAG] filter_by_metadata called - field: '{field}', value: '{value}', limit: {limit}")
    if _vector_store is None:
        return "Error: RAG tools not initialized."

    try:
        # Map field to actual database query approach
        if field == "form_type":
            # form_type has a dedicated indexed query on the vector store.
            results = await _vector_store.find_by_form_type(value, limit=limit)
        else:
            # For other fields, we need to search through text_content
            # Use a raw query approach
            async with _vector_store.pool.connection() as conn:
                async with conn.cursor() as cur:
                    # Build ILIKE pattern for JSON field search.
                    # NOTE(review): this assumes text_content is serialized
                    # with '": "' separators and string values — numeric or
                    # array fields will never match this pattern. ILIKE
                    # wildcards (% and _) inside `value` are not escaped;
                    # confirm inputs are plain text.
                    pattern = f'%"{field}": "{value}"%'

                    # pattern and limit are bound as query parameters, so
                    # there is no SQL-injection path here.
                    await cur.execute(
                        """
                        SELECT source_id, text_content, metadata
                        FROM my_embeddings
                        WHERE text_content ILIKE %s
                        LIMIT %s
                        """,
                        (pattern, limit)
                    )
                    rows = await cur.fetchall()

                    # Convert to SimilarityResult-like format
                    from recommender.vector_store import SimilarityResult, _parse_json_field
                    results = []
                    for row in rows:
                        form_data = _parse_json_field(row[1])
                        metadata = _parse_json_field(row[2])
                        # score=1.0 / distance=0.0 are placeholders: this is
                        # an exact metadata match, not a similarity search.
                        results.append(SimilarityResult(
                            id=row[0],
                            form_data=form_data,
                            form_type=metadata.get("form_type", "unknown"),
                            score=1.0,
                            distance=0.0
                        ))

        return _format_results(results)
    except Exception as e:
        # Return the error as text so the agent can react to it.
        return f"Filter error: {str(e)}"
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
@tool
async def get_document_by_id(doc_id: str) -> str:
    """Retrieve a specific document by ID for deeper inspection.

    Use this when you've identified a promising result from search
    and want to see the complete profile details.

    Args:
        doc_id: The unique document/form ID (e.g., "donor_12345")

    Returns:
        Complete JSON representation of the document
    """
    # NOTE: the docstring above is also the tool description seen by the LLM.
    print(f"[Agentic RAG] get_document_by_id called - doc_id: '{doc_id}'")
    if _vector_store is None:
        return "Error: RAG tools not initialized."

    try:
        result = await _vector_store.get_embedding(doc_id)

        if result is None:
            return f"Document with ID '{doc_id}' not found."

        # Return full document details
        document = {
            "id": result.id,
            "form_type": result.form_type,
            "data": result.form_data
        }

        # default=str keeps non-JSON-native values (dates, UUIDs) serializable.
        return json.dumps(document, indent=2, default=str)
    except Exception as e:
        # Return the error as text so the agent can react to it.
        return f"Retrieval error: {str(e)}"
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
@tool
async def list_available_categories() -> str:
    """List all unique values for filterable fields.

    Use this first to understand what categories exist in the database
    before performing filtered searches. Returns available:
    - Form types (donor, volunteer)
    - Countries (ASEAN country codes)
    - Causes (education, health, etc.)
    - Donor types (individual, corporate, foundation)
    - Volunteer types (regular, event_based, skilled)

    Returns:
        JSON object with distinct values for each category
    """
    print("[Agentic RAG] list_available_categories called")
    # Module-level singleton injected at startup; tools are no-ops before then.
    if _vector_store is None:
        return "Error: RAG tools not initialized."

    try:
        async with _vector_store.pool.connection() as conn:
            async with conn.cursor() as cur:
                # Get form type counts
                await cur.execute("""
                    SELECT
                        metadata->>'form_type' as form_type,
                        COUNT(*) as count
                    FROM my_embeddings
                    GROUP BY metadata->>'form_type'
                """)
                form_types = {row[0]: row[1] for row in await cur.fetchall()}

                # Get distinct countries
                await cur.execute("""
                    SELECT DISTINCT text_content::json->>'country' as country
                    FROM my_embeddings
                    WHERE text_content::json->>'country' IS NOT NULL
                """)
                countries = [row[0] for row in await cur.fetchall() if row[0]]

                # Get distinct donor types
                await cur.execute("""
                    SELECT DISTINCT text_content::json->>'donor_type' as dtype
                    FROM my_embeddings
                    WHERE text_content::json->>'donor_type' IS NOT NULL
                """)
                donor_types = [row[0] for row in await cur.fetchall() if row[0]]

                # Get distinct volunteer types
                await cur.execute("""
                    SELECT DISTINCT text_content::json->>'volunteer_type' as vtype
                    FROM my_embeddings
                    WHERE text_content::json->>'volunteer_type' IS NOT NULL
                """)
                volunteer_types = [row[0] for row in await cur.fetchall() if row[0]]

                # Get all causes (need to aggregate from arrays)
                # NOTE(review): LIKE '%causes%' with LIMIT 100 only samples the
                # first 100 matching rows, so the cause list may be incomplete
                # on larger datasets — confirm this is an acceptable trade-off.
                await cur.execute("""
                    SELECT text_content
                    FROM my_embeddings
                    WHERE text_content LIKE '%causes%'
                    LIMIT 100
                """)
                rows = await cur.fetchall()

        # text_content may arrive as a JSON string or as an already-decoded
        # object depending on the column type; handle both, skipping rows
        # that fail to parse.
        all_causes = set()
        for row in rows:
            try:
                if isinstance(row[0], str):
                    data = json.loads(row[0])
                else:
                    data = row[0]
                causes = data.get("causes", [])
                if isinstance(causes, list):
                    all_causes.update(causes)
            except (json.JSONDecodeError, TypeError):
                pass

        categories = {
            "form_types": form_types,
            "countries": sorted(countries),
            "donor_types": sorted(donor_types),
            "volunteer_types": sorted(volunteer_types),
            "causes": sorted(all_causes),
            "total_records": sum(form_types.values()) if form_types else 0
        }

        return json.dumps(categories, indent=2)
    except Exception as e:
        # Tool errors are returned as text so the agent can read and react.
        return f"Error listing categories: {str(e)}"
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
@tool
async def hybrid_search(
    query: str,
    country: Optional[str] = None,
    form_type: Optional[str] = None,
    causes: Optional[List[str]] = None,
    limit: int = 10
) -> str:
    """Combine semantic search with metadata filters.

    Use this for targeted searches that combine meaning (semantic)
    with specific constraints (filters). More precise than pure
    semantic search when you know specific criteria.

    Args:
        query: Natural language query for semantic matching
        country: Optional country code filter (e.g., "SG", "MY", "TH")
        form_type: Optional form type filter ("donor" or "volunteer")
        causes: Optional list of cause categories to match
        limit: Maximum number of results (default: 10)

    Returns:
        JSON formatted list of results matching both semantic query and filters
    """
    print(f"[Agentic RAG] hybrid_search called - query: '{query}', country: {country}, form_type: {form_type}, causes: {causes}, limit: {limit}")
    if _encoder is None or _vector_store is None:
        return "Error: RAG tools not initialized."

    try:
        # Encode the query
        embedding = await _encoder.encode(query)

        # Use cause-based hybrid search if causes specified
        if causes:
            # Post-filtering (form_type/country) prunes candidates, so
            # fetching exactly `limit` could leave fewer than `limit`
            # results even when more matches exist. Over-fetch candidates
            # when filters are active, then trim after filtering.
            needs_post_filter = bool(form_type or country)
            fetch_limit = limit * 5 if needs_post_filter else limit

            results = await _vector_store.find_by_causes(
                target_causes=causes,
                query_embedding=embedding,
                limit=fetch_limit
            )

            if needs_post_filter:
                results = [
                    r for r in results
                    if (not form_type or r.form_type == form_type)
                    and (not country or r.form_data.get("country") == country)
                ][:limit]
        else:
            # Standard similarity search with filters pushed down to the store
            results = await _vector_store.find_similar(
                query_embedding=embedding,
                form_type=form_type,
                limit=limit,
                country_filter=country
            )

        return _format_results(results)
    except Exception as e:
        # Surface errors as text so the agent can read and react.
        return f"Hybrid search error: {str(e)}"
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
@tool
async def get_statistics() -> str:
    """Get overall statistics about the vector store.

    Use this to understand the size and composition of the database
    before starting your search.

    Returns:
        JSON with counts by form type and other aggregate stats
    """
    print("[Agentic RAG] get_statistics called")
    # Tools are unusable until the module-level store has been injected.
    if _vector_store is None:
        return "Error: RAG tools not initialized."

    try:
        type_counts = await _vector_store.count_by_type()
        payload = {
            "database_statistics": type_counts,
            "description": "Number of entries by form type in the vector store"
        }
        return json.dumps(payload, indent=2)
    except Exception as e:
        return f"Error getting statistics: {str(e)}"
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
# Export all RAG tools as a list for easy registration.
# Each entry is a @tool-decorated coroutine defined in this module.
RAG_TOOLS = [
    semantic_search,
    filter_by_metadata,
    get_document_by_id,
    list_available_categories,
    hybrid_search,
    get_statistics,
]
|
tools/web_search.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Web search tool for searching charity organization information using OpenAI."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from pprint import pprint
|
| 5 |
+
from typing import Dict, Any, Optional
|
| 6 |
+
from openai import OpenAI
|
| 7 |
+
from langchain_core.tools import tool
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Simple in-memory cache to avoid duplicate searches within a session.
# Keyed by the lowercased, stripped query string; values are raw result text.
_search_cache: Dict[str, str] = {}
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_openai_client():
    """Get OpenAI client instance.

    NOTE(review): relies on the OpenAI SDK's default configuration
    (presumably the OPENAI_API_KEY environment variable) — confirm
    credentials are supplied via the environment at deploy time.
    """
    return OpenAI()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def clear_search_cache():
    """Clear the search cache. Call this at the start of a new conversation."""
    # .clear() mutates the module-level dict in place, so a `global`
    # declaration is unnecessary (it is only required when rebinding).
    _search_cache.clear()
    print("🗑️ Search cache cleared")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def openai_web_search(query: str, use_cache: bool = True) -> str:
    """Perform web search using OpenAI's web_search tool.

    Args:
        query: The search query
        use_cache: Whether to use cached results if available

    Returns:
        Search results as text
    """
    bar = "=" * 50
    cache_key = query.lower().strip()

    # Serve from the session cache when possible to avoid duplicate searches.
    if use_cache and cache_key in _search_cache:
        print("\n" + bar)
        print("📦 RETURNING CACHED SEARCH RESULT")
        print(bar)
        pprint({"query": query, "cached": True})
        print(bar + "\n")
        return _search_cache[cache_key]

    print("\n" + bar)
    print("🔍 OPENAI WEB SEARCH CALLED")
    print(bar)
    pprint({"query": query})
    print(bar + "\n")

    client = get_openai_client()

    try:
        response = client.responses.create(
            model="gpt-5",
            tools=[{"type": "web_search"}],
            input=query
        )
        dash = "-" * 50
        print("\n" + dash)
        print("✅ SEARCH RESULTS RECEIVED")
        print(dash)
        pprint({"output_length": len(response.output_text)})
        print(dash + "\n")

        # Remember the result for subsequent identical queries.
        output = response.output_text
        _search_cache[cache_key] = output
        return output
    except Exception as e:
        print(f"\n❌ SEARCH FAILED: {str(e)}\n")
        return f"Search failed: {str(e)}"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@tool
def search_charity_comprehensive(charity_name: str) -> str:
    """Search the web for comprehensive information about a charity organization.

    This tool performs a SINGLE optimized search to find ALL relevant information
    about a charity including:
    - Mission and programs
    - Charity ratings (Charity Navigator, GuideStar, BBB)
    - Financial transparency and accountability
    - Recent news and impact reports
    - Contact information and ways to donate

    Use this as your PRIMARY tool - it combines general info and ratings in one search.

    Args:
        charity_name: The name of the charity organization to research.
            Example: "Red Cross" or "Doctors Without Borders"

    Returns:
        Comprehensive search results about the charity including ratings and programs.
    """
    print("\n📋 TOOL CALLED: search_charity_comprehensive")
    pprint({"charity_name": charity_name})

    # Assemble one query covering identity, mission, ratings, and finances
    # so a single web search returns all aspects at once.
    query_parts = (
        f"{charity_name} charity nonprofit organization",
        "mission programs impact",
        "Charity Navigator rating GuideStar",
        "financial transparency accountability review",
    )
    full_query = " ".join(query_parts)

    try:
        return openai_web_search(full_query)
    except Exception as e:
        return f"Search failed: {str(e)}. Please try again with a different query."
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
@tool
def search_charity_info(query: str) -> str:
    """Search the web for information about a charity organization.

    NOTE: Prefer using search_charity_comprehensive for most queries as it
    combines general info and ratings in a single search.

    Use this tool only when you need to search for something very specific
    that isn't covered by comprehensive search.

    Args:
        query: The search query about the charity organization.
            Example: "Red Cross disaster relief programs 2024"

    Returns:
        Search results containing relevant information about the charity.
    """
    print("\n📋 TOOL CALLED: search_charity_info")
    pprint({"input_query": query})

    # Scope the free-form query toward charity/nonprofit results.
    scoped_query = " ".join([query, "charity nonprofit organization"])

    try:
        return openai_web_search(scoped_query)
    except Exception as e:
        return f"Search failed: {str(e)}. Please try again with a different query."
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@tool
def search_charity_ratings(charity_name: str) -> str:
    """Search for charity ratings and reviews from watchdog organizations.

    NOTE: Prefer using search_charity_comprehensive as it already includes
    rating information. Use this only if you specifically need MORE detailed
    rating information after the comprehensive search.

    Args:
        charity_name: The name of the charity to look up ratings for.

    Returns:
        Information about the charity's ratings and accountability.
    """
    print("\n⭐ TOOL CALLED: search_charity_ratings")
    pprint({"charity_name": charity_name})

    # Target the major charity-watchdog rating sources explicitly.
    rating_query = " ".join(
        [charity_name, "charity rating Charity Navigator GuideStar review"]
    )

    try:
        return openai_web_search(rating_query)
    except Exception as e:
        return f"Rating search failed: {str(e)}. Please try again."
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# List of all available tools for the charity search agent.
# Put comprehensive search FIRST so the LLM prefers it over the narrower,
# single-aspect tools that follow.
CHARITY_SEARCH_TOOLS = [search_charity_comprehensive, search_charity_info, search_charity_ratings]
|