Spaces:
Sleeping
Sleeping
JerameeUC committed on
Commit Β·
ecbc643
1
Parent(s): 071c820
7th Commit - All placeholder code added.
Browse files- FLATTENED_CODE.txt +0 -0
- Makefile +59 -2
- anon_bot/handler.py +41 -2
- anon_bot/rules.py +90 -1
- docs/architecture.md +72 -1
- docs/design.md +71 -1
- docs/flowchart.png +0 -0
- examples/example-dev.py +36 -0
- examples/example.py +59 -5
- integrations/azure/bot_framework.py +39 -2
- integrations/email/ticket_stub.py +56 -1
- logged_in_bot/sentiment_azure.py +187 -0
- logged_in_bot/tools.py +224 -0
- memory/rag/indexer.py +343 -0
- memory/rag/retriever.py +267 -0
- memory/sessions.py +243 -0
- memory/store.py +143 -1
- nlu/pipeline.py +75 -1
- nlu/prompts.py +77 -0
- nlu/router.py +142 -0
- requirements-dev.txt +7 -0
- requirements-ml.txt +7 -0
- requirements.txt +10 -13
- scripts/check_compliance.py +79 -1
- scripts/run_local.sh +43 -3
- scripts/seed_data.py +92 -1
- tests/test_anon_bot.py +119 -1
- tests/test_guardrails.py +39 -1
- tests/test_indexer.py +24 -0
- tests/test_logged_in_bot.py +83 -1
- tests/test_memory.py +94 -1
- tests/test_nlu.py +45 -1
- tests/test_retriever.py +34 -0
- tests/test_sessions.py +21 -0
- tree.txt +4 -5
FLATTENED_CODE.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Makefile
CHANGED
|
@@ -1,11 +1,68 @@
|
|
| 1 |
-
.PHONY: dev test run seed check
|
|
|
|
|
|
|
| 2 |
dev:
|
| 3 |
pip install -r requirements.txt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
test:
|
| 5 |
-
pytest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
run:
|
| 7 |
export PYTHONPATH=. && python -c "from storefront_chatbot.app.app import build; build().launch(server_name='0.0.0.0', server_port=7860)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
seed:
|
| 9 |
python storefront_chatbot/scripts/seed_data.py
|
|
|
|
| 10 |
check:
|
| 11 |
python storefront_chatbot/scripts/check_compliance.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: dev ml dev-deps example example-dev test run seed check lint fmt typecheck clean serve all ci coverage docker-build docker-run
|
| 2 |
+
|
| 3 |
+
# --- setup ---
|
| 4 |
dev:
|
| 5 |
pip install -r requirements.txt
|
| 6 |
+
|
| 7 |
+
ml:
|
| 8 |
+
pip install -r requirements-ml.txt
|
| 9 |
+
|
| 10 |
+
dev-deps:
|
| 11 |
+
pip install -r requirements-dev.txt
|
| 12 |
+
|
| 13 |
+
# --- one-stop local env + tests ---
|
| 14 |
+
example-dev: dev dev-deps
|
| 15 |
+
pytest
|
| 16 |
+
@echo "β
Dev environment ready. Try 'make example' to run the CLI demo."
|
| 17 |
+
|
| 18 |
+
# --- tests & coverage ---
|
| 19 |
test:
|
| 20 |
+
pytest
|
| 21 |
+
|
| 22 |
+
coverage:
|
| 23 |
+
pytest --cov=storefront_chatbot --cov-report=term-missing
|
| 24 |
+
|
| 25 |
+
# --- run app ---
|
| 26 |
run:
|
| 27 |
export PYTHONPATH=. && python -c "from storefront_chatbot.app.app import build; build().launch(server_name='0.0.0.0', server_port=7860)"
|
| 28 |
+
|
| 29 |
+
# --- example demo ---
|
| 30 |
+
example:
|
| 31 |
+
export PYTHONPATH=. && python example/example.py "hello world"
|
| 32 |
+
|
| 33 |
+
# --- data & checks ---
|
| 34 |
seed:
|
| 35 |
python storefront_chatbot/scripts/seed_data.py
|
| 36 |
+
|
| 37 |
check:
|
| 38 |
python storefront_chatbot/scripts/check_compliance.py
|
| 39 |
+
|
| 40 |
+
# --- quality gates ---
|
| 41 |
+
lint:
|
| 42 |
+
flake8 storefront_chatbot
|
| 43 |
+
|
| 44 |
+
fmt:
|
| 45 |
+
black .
|
| 46 |
+
isort .
|
| 47 |
+
|
| 48 |
+
typecheck:
|
| 49 |
+
mypy .
|
| 50 |
+
|
| 51 |
+
# --- hygiene ---
|
| 52 |
+
clean:
|
| 53 |
+
find . -type d -name "__pycache__" -exec rm -rf {} +
|
| 54 |
+
rm -rf .pytest_cache .mypy_cache .ruff_cache .coverage
|
| 55 |
+
|
| 56 |
+
serve:
|
| 57 |
+
export PYTHONPATH=. && uvicorn storefront_chatbot.app.app:build --reload --host 0.0.0.0 --port 7860
|
| 58 |
+
|
| 59 |
+
# --- docker (optional) ---
|
| 60 |
+
docker-build:
|
| 61 |
+
docker build -t storefront-chatbot .
|
| 62 |
+
|
| 63 |
+
docker-run:
|
| 64 |
+
docker run -p 7860:7860 storefront-chatbot
|
| 65 |
+
|
| 66 |
+
# --- bundles ---
|
| 67 |
+
all: clean check test
|
| 68 |
+
ci: lint typecheck coverage
|
anon_bot/handler.py
CHANGED
|
@@ -1,3 +1,42 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# anon_bot/handler.py
"""
Stateless(ish) turn handler for the anonymous chatbot.

Signature kept tiny: handle_turn(message, history, user) -> new_history
- message: str (user text)
- history: list of [speaker, text] pairs, or None
- user: dict-like info (ignored here, but accepted for compatibility)
"""

from __future__ import annotations
from typing import Any, List, Tuple

from . import rules

History = List[Tuple[str, str]]  # [("user", "..."), ("bot", "...")]


def _coerce_history(h: Any) -> History:
    """Normalize arbitrary history input into a list of (speaker, text) tuples.

    Malformed entries (anything not indexable as a pair) are silently
    skipped so a bad client payload can never crash a turn.
    """
    if not h:
        return []
    out: History = []
    for item in h:
        try:
            who, text = item[0], item[1]
        except Exception:
            # Best-effort: drop entries that are not (speaker, text)-shaped.
            continue
        out.append((str(who), str(text)))
    return out


def handle_turn(message: str, history: History | None, user: dict | None) -> History:
    """Process one chat turn and return the updated history.

    Empty/whitespace-only messages leave the history unchanged.
    `user` is accepted for interface compatibility but unused here.
    """
    hist = _coerce_history(history)
    user_text = (message or "").strip()
    if user_text:
        hist.append(("user", user_text))
        rep = rules.reply_for(user_text, hist)
        hist.append(("bot", rep.text))
    return hist


# Convenience: one-shot string->string (used by plain JSON endpoints)
def handle_text(message: str, history: History | None = None) -> str:
    """Return just the bot's reply text for a single message."""
    new_hist = handle_turn(message, history, user=None)
    # Last item is the bot reply ("" if the message was empty).
    return new_hist[-1][1] if new_hist else ""
|
anon_bot/rules.py
CHANGED
|
@@ -1 +1,90 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# anon_bot/rules.py
"""
Lightweight rule set for an anonymous chatbot.
No external providers required. Pure-Python, deterministic.
"""

from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Tuple

# ---- Types ----
History = List[Tuple[str, str]]  # e.g., [("user","hi"), ("bot","hello!")]


@dataclass(frozen=True)
class Reply:
    """A single bot reply plus optional string metadata."""
    text: str
    meta: Dict[str, str] | None = None


def normalize(s: str) -> str:
    """Lowercase and collapse all internal whitespace; safe for None."""
    return " ".join((s or "").strip().split()).lower()


def capabilities() -> List[str]:
    """Human-readable list of supported commands (shown by 'help')."""
    return [
        "help",
        "reverse <text>",
        "echo <text>",
        "small talk (hi/hello/hey)",
    ]


def intent_of(text: str) -> str:
    """Classify raw user text into one of:
    empty | help | reverse | echo | greet | chat.
    """
    t = normalize(text)
    if not t:
        return "empty"
    if t in {"help", "/help", "capabilities"}:
        return "help"
    if t.startswith("reverse "):
        return "reverse"
    if t.startswith("echo "):
        return "echo"
    if t in {"hi", "hello", "hey"}:
        return "greet"
    return "chat"


def handle_help() -> Reply:
    """Render the capabilities list as a bullet-point reply."""
    lines = ["I can:"]
    for c in capabilities():
        lines.append(f"- {c}")
    return Reply("\n".join(lines))


def handle_reverse(t: str) -> Reply:
    """Reverse everything after the first space of the raw command."""
    payload = t.split(" ", 1)[1] if " " in t else ""
    return Reply(payload[::-1] if payload else "(nothing to reverse)")


def handle_echo(t: str) -> Reply:
    """Echo everything after the first space of the raw command."""
    payload = t.split(" ", 1)[1] if " " in t else ""
    return Reply(payload or "(nothing to echo)")


def handle_greet() -> Reply:
    """Respond to small talk."""
    # NOTE(review): emoji reconstructed from a mangled scrape — confirm glyph.
    return Reply("Hello! 👋 Type 'help' to see what I can do.")


def handle_chat(t: str, history: History) -> Reply:
    """Very simple 'ELIZA-ish' fallback for anything unclassified.

    `history` is accepted for future use but currently unused.
    """
    if "help" in t:
        return handle_help()
    if "you" in t and "who" in t:
        return Reply("I'm a tiny anonymous chatbot kernel.")
    return Reply("Noted. (anonymous mode) Type 'help' for commands.")


def reply_for(text: str, history: History) -> Reply:
    """Main entry point: route `text` to the matching handler.

    Command handlers receive the RAW text so payload casing/spacing
    is preserved (only intent detection is normalized).
    """
    it = intent_of(text)
    if it == "empty":
        return Reply("Please type something. Try 'help'.")
    if it == "help":
        return handle_help()
    if it == "reverse":
        return handle_reverse(text)
    if it == "echo":
        return handle_echo(text)
    if it == "greet":
        return handle_greet()
    return handle_chat(text.lower(), history)
|
docs/architecture.md
CHANGED
|
@@ -1,2 +1,73 @@
|
|
| 1 |
<!-- /docs/slides/architecture.md -->
|
| 2 |
-
# Architecture
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
<!-- /docs/slides/architecture.md -->
|
| 2 |
+
# Architecture
|
| 3 |
+
|
| 4 |
+
This system follows a **modular chatbot architecture** built around a clear flow of data from the user interface to external services and back. The design emphasizes separation of concerns, allowing each module to handle a specific responsibility while keeping the overall system simple to test and extend.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## High-Level Flow (tied to flowchart)
|
| 9 |
+
|
| 10 |
+
1. **User Interface (UI)**
|
| 11 |
+
- The entry point for user interaction.
|
| 12 |
+
- Implemented through a web client (e.g., Gradio, HTML templates, or API endpoint).
|
| 13 |
+
- Captures user input and displays bot responses.
|
| 14 |
+
|
| 15 |
+
2. **Router / Core Logic**
|
| 16 |
+
- Handles conversation state and routes messages.
|
| 17 |
+
- Delegates to either the anonymous bot, logged-in bot, or agentic extensions.
|
| 18 |
+
- Imports lightweight rules from `anon_bot/rules.py` for anonymous sessions, and integrates with advanced providers for logged-in sessions.
|
| 19 |
+
|
| 20 |
+
3. **NLU (Natural Language Understanding)**
|
| 21 |
+
- Managed by the `nlu/` pipeline (intent recognition, prompts, and routing).
|
| 22 |
+
- Provides preprocessing, normalization, and optional summarization/RAG.
|
| 23 |
+
- Keeps the system extensible for additional models without changing the rest of the stack.
|
| 24 |
+
|
| 25 |
+
4. **Memory & Context Layer**
|
| 26 |
+
- Implemented in `memory/` (sessions, store, and optional RAG retriever/indexer).
|
| 27 |
+
- Stores session history, enabling context-aware responses.
|
| 28 |
+
- Supports modular backends (in-memory, file-based, or vector index).
|
| 29 |
+
|
| 30 |
+
5. **External AI Service Connector (optional)**
|
| 31 |
+
- For logged-in flows, integrates with cloud AIaaS (e.g., Azure, HuggingFace, or open-source LLMs).
|
| 32 |
+
- Uses `logged_in_bot/sentiment_azure.py` or `agenticcore/providers_unified.py`.
|
| 33 |
+
- Provides NLP services like sentiment analysis or summarization.
|
| 34 |
+
- Disabled in anonymous mode for privacy.
|
| 35 |
+
|
| 36 |
+
6. **Guardrails & Safety**
|
| 37 |
+
- Defined in `guardrails/` (PII redaction, safety filters).
|
| 38 |
+
- Applied before responses are shown to the user.
|
| 39 |
+
- Ensures compliance with privacy/security requirements.
|
| 40 |
+
|
| 41 |
+
7. **Outputs**
|
| 42 |
+
- Bot response returned to the UI.
|
| 43 |
+
- Logs written via `core/logging.py` for traceability and debugging.
|
| 44 |
+
- Optional screenshots and reports recorded for evaluation.
|
| 45 |
+
|
| 46 |
+
---
|
| 47 |
+
|
| 48 |
+
## Key Principles
|
| 49 |
+
|
| 50 |
+
- **Modularity**: Each part of the flow is a self-contained module (UI, NLU, memory, guardrails).
|
| 51 |
+
- **Swap-in Providers**: Agentic core can switch between local rules, RAG memory, or external APIs.
|
| 52 |
+
- **Anonymous vs Logged-In**: Anonymous bot uses lightweight rules with no external calls; logged-in bot can call providers.
|
| 53 |
+
- **Extensibility**: Flowchart design makes it easy to add summarization, conversation modes, or other βagenticβ behaviors without rewriting the core.
|
| 54 |
+
- **Resilience**: If an external service fails, the system degrades gracefully to local responses.
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## Mapping to Repo Structure
|
| 59 |
+
|
| 60 |
+
- `app/` β User-facing entrypoint (routes, HTML, API).
|
| 61 |
+
- `anon_bot/` β Anonymous chatbot rules + handler.
|
| 62 |
+
- `logged_in_bot/` β Provider-based flows for authenticated users.
|
| 63 |
+
- `nlu/` β Intent routing, prompts, pipeline.
|
| 64 |
+
- `memory/` β Session management + RAG integration.
|
| 65 |
+
- `guardrails/` β Safety filters + PII redaction.
|
| 66 |
+
- `agenticcore/` β Core integration logic and unified providers.
|
| 67 |
+
- `docs/flowchart.png` β Visual representation of this architecture.
|
| 68 |
+
|
| 69 |
+
---
|
| 70 |
+
|
| 71 |
+
## Summary
|
| 72 |
+
|
| 73 |
+
The architecture ensures a **clean separation between interface, logic, and services**, enabling experimentation with different providers while guaranteeing a safe, privacy-friendly anonymous mode. The flowchart illustrates this layered approach: input β logic β NLU/memory β optional AIaaS β guardrails β output.
|
docs/design.md
CHANGED
|
@@ -1,2 +1,72 @@
|
|
| 1 |
<!-- /docs/slides/design.md -->
|
| 2 |
-
# Design
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
<!-- /docs/slides/design.md -->
|
| 2 |
+
# Design Notes
|
| 3 |
+
|
| 4 |
+
These notes document the reasoning behind major design choices, focusing on **API usage**, **security considerations**, and **tradeoffs** made during development.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## API Notes
|
| 9 |
+
|
| 10 |
+
- **Anonymous vs Logged-In Flows**
|
| 11 |
+
- The **anonymous chatbot** relies purely on local rules (`anon_bot/rules.py`) and does not call any external services.
|
| 12 |
+
- The **logged-in chatbot** integrates with external AIaaS endpoints (e.g., Azure, HuggingFace, or other NLP providers) via modules in `logged_in_bot/` and `agenticcore/providers_unified.py`.
|
| 13 |
+
|
| 14 |
+
- **Endpoints**
|
| 15 |
+
- `/plain-chat` β Anonymous flow; maps to `logic.handle_text`.
|
| 16 |
+
- `/api/messages` β For framework compatibility (e.g., BotFramework or FastAPI demo).
|
| 17 |
+
- `/healthz` β Lightweight health check for monitoring.
|
| 18 |
+
|
| 19 |
+
- **NLU Pipeline**
|
| 20 |
+
- Intent routing (`nlu/router.py`) determines if user input should be treated as a direct command, a small-talk message, or passed to providers.
|
| 21 |
+
- Prompts and transformations are managed in `nlu/prompts.py` to centralize natural language templates.
|
| 22 |
+
|
| 23 |
+
- **Memory Integration**
|
| 24 |
+
- Session memory stored in `memory/sessions.py`.
|
| 25 |
+
- Optional RAG indexer (`memory/rag/indexer.py`) allows document retrieval for extended context.
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## Security Considerations
|
| 30 |
+
|
| 31 |
+
- **API Keys**
|
| 32 |
+
- Keys for external services are never hard-coded.
|
| 33 |
+
- They are pulled from environment variables or `.env` files (via `core/config.py`).
|
| 34 |
+
|
| 35 |
+
- **Data Handling**
|
| 36 |
+
- Anonymous mode never sends user text outside the local process.
|
| 37 |
+
- Logged-in mode applies guardrails before making external calls.
|
| 38 |
+
- Sensitive information (emails, IDs) is redacted using `guardrails/pii_redaction.py`.
|
| 39 |
+
|
| 40 |
+
- **Logging**
|
| 41 |
+
- Logs are structured (`core/logging.py`) and omit sensitive data by default.
|
| 42 |
+
- Debug mode can be enabled for local testing but should not be used in production.
|
| 43 |
+
|
| 44 |
+
- **Privacy**
|
| 45 |
+
- Anonymous sessions are ephemeral: conversation state is stored only in memory unless explicitly persisted.
|
| 46 |
+
- Logged-in sessions may optionally persist data, but only with user consent.
|
| 47 |
+
|
| 48 |
+
---
|
| 49 |
+
|
| 50 |
+
## Tradeoffs
|
| 51 |
+
|
| 52 |
+
- **Rule-Based vs AI-Powered**
|
| 53 |
+
- Rule-based responses are deterministic, fast, and private but limited in sophistication.
|
| 54 |
+
- AI-powered responses (via providers) allow richer understanding but introduce latency, costs, and privacy risks.
|
| 55 |
+
|
| 56 |
+
- **Extensibility vs Simplicity**
|
| 57 |
+
- Chose a **modular repo structure** (separate folders for `anon_bot`, `logged_in_bot`, `memory`, `nlu`) to allow future growth.
|
| 58 |
+
- This adds some boilerplate overhead but makes it easier to swap components.
|
| 59 |
+
|
| 60 |
+
- **Performance vs Accuracy**
|
| 61 |
+
- Non-functional requirement: responses within 2 seconds for 95% of requests.
|
| 62 |
+
- This meant prioritizing lightweight providers and caching over heavyweight models.
|
| 63 |
+
|
| 64 |
+
- **Anonymous Mode as Default**
|
| 65 |
+
- Defaulting to anonymous mode ensures the system works offline and avoids external dependencies.
|
| 66 |
+
- Tradeoff: limits functionality until the user explicitly opts in for a logged-in session.
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
## Summary
|
| 71 |
+
|
| 72 |
+
The design balances **privacy, modularity, and extensibility**. By cleanly separating anonymous and logged-in paths, the system can run entirely offline while still supporting richer AI features when configured. Security and privacy are first-class concerns, and tradeoffs were made to keep the system lightweight, testable, and compliant with project constraints.
|
docs/flowchart.png
DELETED
examples/example-dev.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# /example/example-dev.py
"""
Dev environment sanity example.

- Imports ChatBot
- Sends a test message
- Prints the JSON reply
- Confirms basic dependencies work

Usage:
    python example/example-dev.py
"""

import json
import sys

try:
    from agenticcore.chatbot.services import ChatBot
except ImportError as e:
    # Fail fast with a hint instead of a raw traceback; include the
    # original error so the missing dependency is identifiable.
    print(f"❌ Could not import ChatBot ({e}). Did you set PYTHONPATH or install dependencies?")
    sys.exit(1)


def main():
    """Send one canned message through the bot and pretty-print the reply."""
    bot = ChatBot()
    msg = "Hello from example-dev!"
    result = bot.reply(msg)

    print("✅ Dev environment is working")
    print("Input:", msg)
    print("Reply JSON:")
    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
|
examples/example.py
CHANGED
|
@@ -1,9 +1,63 @@
|
|
| 1 |
# /example/example.py
|
| 2 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import json
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
if __name__ == "__main__":
|
| 7 |
-
|
| 8 |
-
result = bot.reply("hello world")
|
| 9 |
-
print(json.dumps(result, indent=2))
|
|
|
|
# /example/example.py
"""
Simple CLI/REPL example for the ChatBot.

Usage:
    python example/example.py "hello world"
    python example/example.py            # enters interactive mode
"""

import argparse
import json
import sys

try:
    from agenticcore.chatbot.services import ChatBot
except ImportError as e:
    # Include the original error so the missing dependency is identifiable.
    print(f"❌ Could not import ChatBot ({e}). Did you set PYTHONPATH or install agenticcore?")
    sys.exit(1)


def main():
    """Parse CLI args and run either one-shot or interactive mode."""
    parser = argparse.ArgumentParser(description="ChatBot CLI/REPL example")
    parser.add_argument(
        "message",
        nargs="*",
        help="Message to send. Leave empty to start interactive mode.",
    )
    args = parser.parse_args()

    try:
        bot = ChatBot()
    except Exception as e:
        print(f"❌ Failed to initialize ChatBot: {e}")
        sys.exit(1)

    if args.message:
        # One-shot mode: join all positional words into a single message.
        msg = " ".join(args.message)
        result = bot.reply(msg)
        print(json.dumps(result, indent=2))
    else:
        # Interactive REPL
        print("💬 Interactive mode. Type 'quit' or 'exit' to stop.")
        while True:
            try:
                msg = input("> ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\n👋 Exiting.")
                break

            if msg.lower() in {"quit", "exit"}:
                print("👋 Goodbye.")
                break

            if not msg:
                continue

            result = bot.reply(msg)
            print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
|
|
|
|
|
|
integrations/azure/bot_framework.py
CHANGED
|
@@ -1,2 +1,39 @@
|
|
| 1 |
-
# /
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# integrations/azure/bot_framework.py
"""
Azure Bot Framework integration (stub).

This module is a placeholder for connecting the chatbot
to Microsoft Azure Bot Framework. It is optional —
the anonymous bot does not depend on this code.

If you want to enable Azure:
1. Install `botbuilder` SDK (pip install botbuilder-core aiohttp).
2. Fill in the adapter setup and message handling below.
"""

from __future__ import annotations  # allows `X | None` annotations on Python < 3.10

from typing import Any, Dict


class AzureBotFrameworkNotConfigured(Exception):
    """Raised when Azure Bot Framework is called but not set up."""


def init_adapter(config: Dict[str, Any] | None = None):
    """
    Placeholder for BotFrameworkAdapter initialization.

    Always raises AzureBotFrameworkNotConfigured until replaced with
    actual Azure code; `config` is accepted for interface compatibility.
    """
    raise AzureBotFrameworkNotConfigured(
        "Azure Bot Framework integration is not configured. "
        "Use anon_bot for local testing."
    )


def handle_activity(activity: Dict[str, Any]) -> Dict[str, Any]:
    """
    Placeholder for handling an incoming Bot Framework activity.
    Echoes back a dummy response if called directly.
    """
    if not activity:
        return {"type": "message", "text": "(no activity received)"}
    return {"type": "message", "text": f"Echo: {activity.get('text', '')}"}
|
integrations/email/ticket_stub.py
CHANGED
|
@@ -1,2 +1,57 @@
|
|
| 1 |
# /intergrations/email/ticket_stub.py
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# /integrations/email/ticket_stub.py
"""
Email / Ticket System Stub.

This module simulates creating a support ticket via email.
It is a placeholder — no actual emails are sent.
"""

from __future__ import annotations  # allows `str | None` annotations on Python < 3.10

from typing import Any, Dict
import datetime
import uuid


class TicketStub:
    """
    A stub ticketing system that generates a fake ticket ID
    and stores basic info in memory.
    """

    def __init__(self):
        # ticket_id -> ticket metadata dict
        self.tickets: Dict[str, Dict[str, Any]] = {}

    def create_ticket(self, subject: str, body: str, user: str | None = None) -> Dict[str, Any]:
        """
        Create a fake support ticket.
        Returns a dictionary with ticket metadata.
        """
        ticket_id = str(uuid.uuid4())
        # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated);
        # keep the trailing-"Z" convention of the original format.
        created = datetime.datetime.now(datetime.timezone.utc)
        ticket = {
            "id": ticket_id,
            "subject": subject,
            "body": body,
            "user": user or "anonymous",
            "created_at": created.isoformat().replace("+00:00", "Z"),
            "status": "open",
        }
        self.tickets[ticket_id] = ticket
        return ticket

    def get_ticket(self, ticket_id: str) -> Dict[str, Any] | None:
        """Retrieve a ticket by ID if it exists."""
        return self.tickets.get(ticket_id)

    def list_tickets(self) -> list[Dict[str, Any]]:
        """Return all created tickets."""
        return list(self.tickets.values())


# Singleton for convenience
stub = TicketStub()


def create_ticket(subject: str, body: str, user: str | None = None) -> Dict[str, Any]:
    """
    Module-level shortcut: create a ticket on the shared `stub` instance.
    """
    return stub.create_ticket(subject, body, user)
|
logged_in_bot/sentiment_azure.py
CHANGED
|
@@ -1 +1,188 @@
|
|
| 1 |
# /logged_in_bot/sentiment_azure.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /logged_in_bot/sentiment_azure.py
|
| 2 |
+
"""
|
| 3 |
+
Optional Azure Sentiment integration with safe local fallback.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
from logged_in_bot.sentiment_azure import analyze_sentiment, SentimentResult
|
| 7 |
+
|
| 8 |
+
res = analyze_sentiment("I love this!")
|
| 9 |
+
print(res.label, res.score, res.backend) # e.g., "positive", 0.92, "local"
|
| 10 |
+
|
| 11 |
+
Environment (Azure path only):
|
| 12 |
+
- AZURE_LANGUAGE_ENDPOINT or MICROSOFT_AI_ENDPOINT
|
| 13 |
+
- AZURE_LANGUAGE_KEY or MICROSOFT_AI_KEY
|
| 14 |
+
|
| 15 |
+
If the Azure SDK or env vars are missing, we automatically fall back to a
|
| 16 |
+
deterministic, dependency-free heuristic that is fast and good enough for tests.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
from dataclasses import dataclass
|
| 21 |
+
from typing import Optional, Tuple
|
| 22 |
+
import os
|
| 23 |
+
import re
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ---------------------------
|
| 27 |
+
# Public dataclass & API
|
| 28 |
+
# ---------------------------
|
| 29 |
+
|
| 30 |
+
@dataclass(frozen=True)
|
| 31 |
+
class SentimentResult:
|
| 32 |
+
label: str # "positive" | "neutral" | "negative"
|
| 33 |
+
score: float # 0.0 .. 1.0 (confidence-like)
|
| 34 |
+
backend: str # "azure" | "local"
|
| 35 |
+
raw: Optional[dict] = None # provider raw payload if available
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def analyze_sentiment(text: str) -> SentimentResult:
|
| 39 |
+
"""
|
| 40 |
+
Analyze sentiment using Azure if configured, otherwise use local heuristic.
|
| 41 |
+
|
| 42 |
+
Never raises on normal use β returns a result even if Azure is misconfigured,
|
| 43 |
+
satisfying 'graceful degradation' requirements.
|
| 44 |
+
"""
|
| 45 |
+
text = (text or "").strip()
|
| 46 |
+
if not text:
|
| 47 |
+
return SentimentResult(label="neutral", score=0.5, backend="local", raw={"reason": "empty"})
|
| 48 |
+
|
| 49 |
+
# Try Azure first (only if fully configured and package available)
|
| 50 |
+
azure_ready, why = _is_azure_ready()
|
| 51 |
+
if azure_ready:
|
| 52 |
+
try:
|
| 53 |
+
return _azure_sentiment(text)
|
| 54 |
+
except Exception as e:
|
| 55 |
+
# Degrade gracefully to local
|
| 56 |
+
return _local_sentiment(text, note=f"azure_error: {e!r}")
|
| 57 |
+
else:
|
| 58 |
+
# Go local immediately
|
| 59 |
+
return _local_sentiment(text, note=why)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ---------------------------
|
| 63 |
+
# Azure path (optional)
|
| 64 |
+
# ---------------------------
|
| 65 |
+
|
| 66 |
+
def _is_azure_ready() -> Tuple[bool, str]:
|
| 67 |
+
"""
|
| 68 |
+
Check env + optional SDK presence without importing heavy modules unless needed.
|
| 69 |
+
"""
|
| 70 |
+
endpoint = os.getenv("AZURE_LANGUAGE_ENDPOINT") or os.getenv("MICROSOFT_AI_ENDPOINT")
|
| 71 |
+
key = os.getenv("AZURE_LANGUAGE_KEY") or os.getenv("MICROSOFT_AI_KEY")
|
| 72 |
+
if not endpoint or not key:
|
| 73 |
+
return False, "missing_env"
|
| 74 |
+
|
| 75 |
+
try:
|
| 76 |
+
# Light import check
|
| 77 |
+
import importlib
|
| 78 |
+
client_mod = importlib.import_module("azure.ai.textanalytics")
|
| 79 |
+
cred_mod = importlib.import_module("azure.core.credentials")
|
| 80 |
+
# Quick sanity on expected attributes
|
| 81 |
+
getattr(client_mod, "TextAnalyticsClient")
|
| 82 |
+
getattr(cred_mod, "AzureKeyCredential")
|
| 83 |
+
except Exception:
|
| 84 |
+
return False, "sdk_not_installed"
|
| 85 |
+
|
| 86 |
+
return True, "ok"
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _azure_sentiment(text: str) -> SentimentResult:
    """
    Call Azure Text Analytics (Sentiment). Requires:
        pip install azure-ai-textanalytics

    Caller (analyze_sentiment) guards with _is_azure_ready() and catches any
    exception raised here, so this function may assume env vars are present.
    """
    from azure.ai.textanalytics import TextAnalyticsClient
    from azure.core.credentials import AzureKeyCredential

    endpoint = os.getenv("AZURE_LANGUAGE_ENDPOINT") or os.getenv("MICROSOFT_AI_ENDPOINT")
    key = os.getenv("AZURE_LANGUAGE_KEY") or os.getenv("MICROSOFT_AI_KEY")

    client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))
    # API expects a list of documents; we send a single-document batch.
    resp = client.analyze_sentiment(documents=[text], show_opinion_mining=False)
    # NOTE(review): assumes resp[0] is a successful result document; an
    # error document would lack .confidence_scores -- confirm against SDK docs.
    doc = resp[0]

    # Map Azure scores to our schema
    label = (doc.sentiment or "neutral").lower()
    # Choose max score among pos/neu/neg as "confidence-like"
    score_map = {
        "positive": doc.confidence_scores.positive,
        "neutral": doc.confidence_scores.neutral,
        "negative": doc.confidence_scores.negative,
    }
    # "mixed" (or any unexpected label) falls back to the strongest score.
    score = float(score_map.get(label, max(score_map.values())))
    raw = {
        "sentiment": doc.sentiment,
        "confidence_scores": {
            "positive": doc.confidence_scores.positive,
            "neutral": doc.confidence_scores.neutral,
            "negative": doc.confidence_scores.negative,
        },
    }
    return SentimentResult(label=label, score=score, backend="azure", raw=raw)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ---------------------------
|
| 126 |
+
# Local fallback (no deps)
|
| 127 |
+
# ---------------------------
|
| 128 |
+
|
| 129 |
+
_POSITIVE = {
    "good", "great", "love", "excellent", "amazing", "awesome", "happy",
    "wonderful", "fantastic", "like", "enjoy", "cool", "nice", "positive",
}
_NEGATIVE = {
    "bad", "terrible", "hate", "awful", "horrible", "sad", "angry",
    "worse", "worst", "broken", "bug", "issue", "problem", "negative",
}
# Simple negation tokens that flip the polarity of a nearby sentiment word.
_NEGATIONS = {"not", "no", "never", "n't"}

_WORD_RE = re.compile(r"[A-Za-z']+")


def _local_sentiment(text: str, note: str | None = None) -> SentimentResult:
    """
    Dependency-free lexicon heuristic used when Azure is unavailable.

    Tokenizes letters/apostrophes, scores +1 per positive word and -1 per
    negative word, flipping the sign when a negation appears within the
    previous 3 tokens, then maps the total to a pseudo-confidence in 0..1
    (|score| clamped to 6 -> confidence in ~[0.5, 0.95]).
    """
    words = [w.lower() for w in _WORD_RE.findall(text)]

    total = 0
    for idx, word in enumerate(words):
        negated = any(w in _NEGATIONS for w in words[max(0, idx - 3):idx])
        if word in _POSITIVE:
            total += -1 if negated else 1
        elif word in _NEGATIVE:
            total += 1 if negated else -1

    if total > 0:
        label = "positive"
    elif total < 0:
        label = "negative"
    else:
        label = "neutral"

    # Clamp magnitude to 6 and map linearly onto 0.5..0.95.
    magnitude = min(abs(total), 6)
    conf = 0.5 + (magnitude / 6) * 0.45

    raw: dict = {"engine": "heuristic", "score_raw": total}
    if note:
        raw["note"] = note
    return SentimentResult(label=label, score=round(conf, 3), backend="local", raw=raw)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# ---------------------------
|
| 178 |
+
# Convenience (module-level)
|
| 179 |
+
# ---------------------------
|
| 180 |
+
|
| 181 |
+
def sentiment_label(text: str) -> str:
    """Return only 'positive' | 'neutral' | 'negative'."""
    result = analyze_sentiment(text)
    return result.label
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def sentiment_score(text: str) -> float:
    """Return only the 0..1 confidence-like score."""
    result = analyze_sentiment(text)
    return result.score
|
logged_in_bot/tools.py
CHANGED
|
@@ -1 +1,225 @@
|
|
| 1 |
# /logged_in_bot/tools.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /logged_in_bot/tools.py
|
| 2 |
+
"""
|
| 3 |
+
Utilities for the logged-in chatbot flow.
|
| 4 |
+
|
| 5 |
+
Features
|
| 6 |
+
- PII redaction (optional) via guardrails.pii_redaction
|
| 7 |
+
- Sentiment (optional) via logged_in_bot.sentiment_azure (falls back locally)
|
| 8 |
+
- Tiny intent router: summarize | echo | chat
|
| 9 |
+
- Deterministic, dependency-light; safe to import in any environment
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
from dataclasses import asdict, dataclass
|
| 14 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 15 |
+
import os
|
| 16 |
+
import re
|
| 17 |
+
|
| 18 |
+
# -------------------------
|
| 19 |
+
# Optional imports (safe)
|
| 20 |
+
# -------------------------
|
| 21 |
+
|
| 22 |
+
# Sentiment (ours): falls back to a local heuristic if Azure SDK/env missing
|
| 23 |
+
try:
|
| 24 |
+
from .sentiment_azure import analyze_sentiment, SentimentResult # type: ignore
|
| 25 |
+
except Exception: # pragma: no cover
|
| 26 |
+
analyze_sentiment = None
|
| 27 |
+
SentimentResult = None # type: ignore
|
| 28 |
+
|
| 29 |
+
# Guardrails redaction (optional)
|
| 30 |
+
try:
|
| 31 |
+
from guardrails.pii_redaction import redact as pii_redact # type: ignore
|
| 32 |
+
except Exception: # pragma: no cover
|
| 33 |
+
pii_redact = None
|
| 34 |
+
|
| 35 |
+
# core types (optional shape for JSON response)
try:
    from core.types import PlainChatResponse  # dataclass with .to_dict()
except Exception:  # pragma: no cover
    @dataclass
    class PlainChatResponse:  # lightweight fallback shape
        """Minimal stand-in mirroring core.types.PlainChatResponse."""
        # Bot reply text.
        reply: str
        # Optional metadata (intent, sentiment, redaction flags, ...).
        meta: Optional[Dict[str, Any]] = None

        def to_dict(self) -> Dict[str, Any]:
            """Serialize to a plain dict (same contract as the real type)."""
            return asdict(self)


History = List[Tuple[str, str]]  # [("user","..."), ("bot","...")]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# -------------------------
|
| 52 |
+
# Helpers
|
| 53 |
+
# -------------------------
|
| 54 |
+
|
| 55 |
+
_WHITESPACE_RE = re.compile(r"\s+")


def sanitize_text(text: str) -> str:
    """Normalize whitespace and cap very large inputs; cheap and deterministic."""
    cleaned = _WHITESPACE_RE.sub(" ", (text or "").strip())
    # Cap extremely large payloads to protect downstream inference/services.
    limit = int(os.getenv("MAX_INPUT_CHARS", "4000"))
    if len(cleaned) > limit:
        cleaned = cleaned[:limit] + "…"
    return cleaned
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def redact_text(text: str) -> str:
    """Run optional PII redaction when the guardrails hook is present."""
    if not pii_redact:
        # No redactor installed: pass the text through unchanged.
        return text
    try:
        return pii_redact(text)
    except Exception:
        # Redactor misbehaved: fail open but safe (original text).
        return text
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def intent_of(text: str) -> str:
    """Ultra-tiny intent classifier: summarize | echo | help | chat | empty."""
    normalized = text.lower().strip()
    if not normalized:
        return "empty"
    # Padding lets a bare substring check find "summarize" mid-sentence.
    padded = f" {normalized} "
    if normalized.startswith(("summarize ", "summarise ")) or " summarize " in padded:
        return "summarize"
    if normalized.startswith("echo "):
        return "echo"
    if normalized in ("help", "/help", "capabilities"):
        return "help"
    return "chat"
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def summarize_text(text: str, target_len: int = 120) -> str:
    """
    CPU-cheap pseudo-summarizer: return the first sentence, truncated to
    `target_len` characters (with an ellipsis) when it runs long.  The API is
    stable so a real HF model can be swapped in later.
    """
    # Naive sentence boundary: split after ., ! or ? followed by whitespace.
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    lead = sentences[0] if sentences else text.strip()
    if len(lead) <= target_len:
        return lead
    return lead[: target_len - 1].rstrip() + "…"
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def capabilities() -> List[str]:
    """List the commands this mode supports (rendered by the 'help' intent)."""
    commands = [
        "help",
        "echo <text>",
        "summarize <paragraph>",
        "sentiment tagging (logged-in mode)",
    ]
    return commands
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# -------------------------
|
| 118 |
+
# Main entry
|
| 119 |
+
# -------------------------
|
| 120 |
+
|
| 121 |
+
def handle_logged_in_turn(message: str, history: Optional[History], user: Optional[dict]) -> Dict[str, Any]:
    """
    Process one user turn in 'logged-in' mode.

    Returns a PlainChatResponse (dict) with:
      - reply: str
      - meta: { intent, sentiment: {label, score, backend}, redacted: bool }

    Parameters
    ----------
    message : raw user text for this turn.
    history : prior (speaker, text) pairs; only forwarded to the chat
        fallback, which currently ignores it.
    user : logged-in user record; accepted for API symmetry but not read here.
    """
    history = history or []
    user_text_raw = message or ""
    user_text = sanitize_text(user_text_raw)
    redacted = False

    # Redact PII if available (no-op when the guardrails hook is missing).
    redacted_text = redact_text(user_text)
    redacted = (redacted_text != user_text)

    # Route on the redacted text so PII never influences intent handling.
    it = intent_of(redacted_text)

    # ---------- route ----------
    if it == "empty":
        reply = "Please type something. Try 'help' for options."
        meta = _meta(redacted, it, redacted_text)
        return PlainChatResponse(reply=reply, meta=meta).to_dict()

    if it == "help":
        reply = "I can:\n" + "\n".join(f"- {c}" for c in capabilities())
        meta = _meta(redacted, it, redacted_text)
        return PlainChatResponse(reply=reply, meta=meta).to_dict()

    if it == "echo":
        # Everything after the leading "echo " keyword.
        payload = redacted_text.split(" ", 1)[1] if " " in redacted_text else ""
        reply = payload or "(nothing to echo)"
        meta = _meta(redacted, it, redacted_text)
        _attach_sentiment(meta, reply)  # sentiment on reply text
        return PlainChatResponse(reply=reply, meta=meta).to_dict()

    if it == "summarize":
        # Use everything after the keyword if present
        if redacted_text.lower().startswith("summarize "):
            payload = redacted_text.split(" ", 1)[1]
        elif redacted_text.lower().startswith("summarise "):
            payload = redacted_text.split(" ", 1)[1]
        else:
            # Intent matched mid-sentence: summarize the whole input.
            payload = redacted_text
        reply = summarize_text(payload)
        meta = _meta(redacted, it, redacted_text)
        _attach_sentiment(meta, payload)  # sentiment on source text
        return PlainChatResponse(reply=reply, meta=meta).to_dict()

    # default: chat
    reply = _chat_fallback(redacted_text, history)
    meta = _meta(redacted, it, redacted_text)
    _attach_sentiment(meta, redacted_text)
    return PlainChatResponse(reply=reply, meta=meta).to_dict()
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# -------------------------
|
| 179 |
+
# Internals
|
| 180 |
+
# -------------------------
|
| 181 |
+
|
| 182 |
+
def _chat_fallback(text: str, history: History) -> str:
|
| 183 |
+
"""
|
| 184 |
+
Minimal deterministic fallback for general chat in logged-in mode.
|
| 185 |
+
Swap this for a provider call if/when you enable one.
|
| 186 |
+
"""
|
| 187 |
+
if "who are you" in text.lower():
|
| 188 |
+
return "I'm the logged-in chatbot. I can echo, summarize, and tag sentiment."
|
| 189 |
+
return "Noted! (logged-in mode). Type 'help' for options."
|
| 190 |
+
|
| 191 |
+
def _meta(redacted: bool, intent: str, redacted_text: str) -> Dict[str, Any]:
|
| 192 |
+
return {
|
| 193 |
+
"intent": intent,
|
| 194 |
+
"redacted": redacted,
|
| 195 |
+
"input_len": len(redacted_text),
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
def _attach_sentiment(meta: Dict[str, Any], text: str) -> None:
|
| 199 |
+
"""Attach sentiment to meta if available; never raises."""
|
| 200 |
+
try:
|
| 201 |
+
if analyze_sentiment:
|
| 202 |
+
res = analyze_sentiment(text)
|
| 203 |
+
if hasattr(res, "__dict__"):
|
| 204 |
+
meta["sentiment"] = {
|
| 205 |
+
"label": res.label,
|
| 206 |
+
"score": res.score,
|
| 207 |
+
"backend": res.backend,
|
| 208 |
+
}
|
| 209 |
+
else: # unexpected object β store string
|
| 210 |
+
meta["sentiment"] = {"label": str(res)}
|
| 211 |
+
else:
|
| 212 |
+
# no module available
|
| 213 |
+
meta["sentiment"] = {"label": "neutral", "score": 0.5, "backend": "none"}
|
| 214 |
+
except Exception as e: # pragma: no cover
|
| 215 |
+
meta["sentiment"] = {"error": f"{type(e).__name__}: {e}"}
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
# Public API of this module.
__all__ = [
    "handle_logged_in_turn",
    "sanitize_text",
    "redact_text",
    "intent_of",
    "summarize_text",
    "capabilities",
]
|
memory/rag/indexer.py
CHANGED
|
@@ -1 +1,344 @@
|
|
| 1 |
# /memory/rag/data/indexer.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /memory/rag/data/indexer.py
|
| 2 |
+
"""
|
| 3 |
+
Minimal, dependency-free TF-IDF indexer for RAG.
|
| 4 |
+
|
| 5 |
+
Features
|
| 6 |
+
- Build from folder (recursive), index plain-text files
|
| 7 |
+
- Add individual text blobs with metadata
|
| 8 |
+
- Persist/load inverted index to/from JSON
|
| 9 |
+
- Search with TF-IDF scoring and simple query normalization
|
| 10 |
+
- Return top-k with tiny context snippets
|
| 11 |
+
|
| 12 |
+
This module is intentionally small and pure-Python to keep local CPU demos simple.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
from dataclasses import dataclass, asdict
|
| 17 |
+
from typing import Dict, List, Tuple, Iterable, Optional
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
import json
|
| 20 |
+
import math
|
| 21 |
+
import hashlib
|
| 22 |
+
import re
|
| 23 |
+
import fnmatch
|
| 24 |
+
import time
|
| 25 |
+
|
| 26 |
+
# -----------------------------
|
| 27 |
+
# Types
|
| 28 |
+
# -----------------------------
|
| 29 |
+
|
| 30 |
+
@dataclass(frozen=True)
class DocMeta:
    """Immutable metadata describing one indexed document."""
    doc_id: str
    source: str                   # e.g., absolute path or "inline"
    title: str | None = None      # display name (file name for file docs)
    tags: List[str] | None = None  # optional free-form labels
    mtime: float | None = None    # source last modified (if file)
    hash: str | None = None       # content hash
|
| 38 |
+
|
| 39 |
+
@dataclass(frozen=True)
class Hit:
    """One ranked search result with a short context snippet."""
    doc_id: str
    score: float                  # TF-IDF score, rounded to 4 decimals
    source: str                   # provenance (path or "inline")
    snippet: str                  # small excerpt around the first match
    title: str | None = None
    tags: List[str] | None = None
|
| 47 |
+
|
| 48 |
+
# -----------------------------
|
| 49 |
+
# Tokenization
|
| 50 |
+
# -----------------------------
|
| 51 |
+
|
| 52 |
+
_WORD_RE = re.compile(r"[A-Za-z0-9']+")

def tokenize(text: str) -> List[str]:
    """Deterministic lowercase tokenizer over alphanumerics and apostrophes."""
    matches = _WORD_RE.findall(text or "")
    return [m.lower() for m in matches]
|
| 57 |
+
|
| 58 |
+
# -----------------------------
|
| 59 |
+
# Index
|
| 60 |
+
# -----------------------------
|
| 61 |
+
|
| 62 |
+
class TfidfIndex:
    """
    Tiny TF-IDF inverted index with JSON persistence.

    Structures:
      - docs: doc_id -> {"meta": DocMeta, "len": int, "text": str (optional)}
      - inv: term -> {doc_id: tf} (raw term frequency)
      - df: term -> document frequency
      - n_docs: total number of docs
    """

    def __init__(self) -> None:
        """Create an empty index."""
        self.docs: Dict[str, Dict] = {}
        self.inv: Dict[str, Dict[str, int]] = {}
        self.df: Dict[str, int] = {}
        self.n_docs: int = 0

    # ---------- add documents ----------

    def add_text(self, doc_id: str, text: str, meta: DocMeta) -> None:
        """
        Add (or re-add) a text blob under `doc_id`.

        Empty text, or text that yields no tokens, is a no-op (note: when the
        doc already existed and the new text tokenizes to nothing, the old
        postings are removed and the doc ends up deleted).
        """
        if not text:
            return
        if doc_id in self.docs:
            # idempotent update: remove old postings first
            self._remove_doc_terms(doc_id)

        toks = tokenize(text)
        if not toks:
            return

        # Raw term frequencies for this document.
        tf: Dict[str, int] = {}
        for t in toks:
            tf[t] = tf.get(t, 0) + 1

        # update inv + df
        for term, cnt in tf.items():
            bucket = self.inv.setdefault(term, {})
            bucket[doc_id] = cnt
            self.df[term] = len(bucket)

        self.docs[doc_id] = {
            "meta": meta,
            "len": len(toks),
            # keep original text for snippet extraction; you can drop this if size matters
            "text": text,
        }
        self.n_docs = len(self.docs)

    def add_file(self, path: Path, doc_id: str | None = None, title: str | None = None, tags: List[str] | None = None) -> Optional[str]:
        """
        Index a file from disk; returns its doc_id or None if not a file.

        Skips re-indexing when both content hash and mtime are unchanged.
        """
        path = Path(path)
        if not path.is_file():
            return None
        text = path.read_text(encoding="utf-8", errors="ignore")
        h = sha256_of(text)
        stat = path.stat()
        doc_id = doc_id or str(path.resolve())

        # skip if unchanged
        prev = self.docs.get(doc_id)
        if prev:
            old_meta: DocMeta = prev["meta"]
            if old_meta.hash == h and old_meta.mtime == stat.st_mtime:
                return doc_id  # unchanged

        meta = DocMeta(
            doc_id=doc_id,
            source=str(path.resolve()),
            title=title or path.name,
            tags=tags,
            mtime=stat.st_mtime,
            hash=h,
        )
        self.add_text(doc_id, text, meta)
        return doc_id

    # ---------- build / scan ----------

    def build_from_folder(
        self,
        root: Path,
        include: Iterable[str] = ("*.txt", "*.md"),
        exclude: Iterable[str] = (".git/*",),
        recursive: bool = True,
    ) -> int:
        """
        Index all files under `root` matching any include pattern and not matching exclude.
        Returns number of files indexed or updated.

        Patterns are matched against the POSIX-style path relative to `root`
        (so "*.md" matches only top-level files; use "**/*.md"-style patterns
        for nested matches per fnmatch semantics).
        """
        root = Path(root)
        if not root.exists():
            return 0

        count = 0
        paths = (root.rglob("*") if recursive else root.glob("*"))
        for p in paths:
            if not p.is_file():
                continue
            rel = str(p.relative_to(root).as_posix())
            if not any(fnmatch.fnmatch(rel, pat) for pat in include):
                continue
            if any(fnmatch.fnmatch(rel, pat) for pat in exclude):
                continue
            if self.add_file(p):
                count += 1
        return count

    # ---------- search ----------

    def search(self, query: str, k: int = 5) -> List[Hit]:
        """
        Rank documents against `query` with log-TF * smoothed-IDF weights and
        a gentle 1/sqrt(len) length normalization; return the top `k` hits.
        """
        q_toks = tokenize(query)
        if not q_toks or self.n_docs == 0:
            return []

        # compute query tf-idf (using binary or raw tf is fine; keep it simple)
        q_tf: Dict[str, int] = {}
        for t in q_toks:
            q_tf[t] = q_tf.get(t, 0) + 1

        # compute idf with +1 smoothing
        idf: Dict[str, float] = {}
        for t in q_tf:
            df = self.df.get(t, 0)
            idf[t] = math.log((1 + self.n_docs) / (1 + df)) + 1.0

        # accumulate scores: cosine-like with length norm
        scores: Dict[str, float] = {}
        doc_len_norm: Dict[str, float] = {}
        for term, qcnt in q_tf.items():
            postings = self.inv.get(term)
            if not postings:
                continue
            wq = (1 + math.log(qcnt)) * idf[term]  # log tf * idf
            for doc_id, dcnt in postings.items():
                wd = (1 + math.log(dcnt)) * idf[term]
                scores[doc_id] = scores.get(doc_id, 0.0) + (wq * wd)
                # cache norm
                if doc_id not in doc_len_norm:
                    L = max(1, self.docs[doc_id]["len"])
                    doc_len_norm[doc_id] = 1.0 / math.sqrt(L)

        # apply a gentle length normalization
        for d, s in list(scores.items()):
            scores[d] = s * doc_len_norm.get(d, 1.0)

        # rank and format
        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:k]
        hits: List[Hit] = []
        for doc_id, score in ranked:
            d = self.docs[doc_id]
            meta: DocMeta = d["meta"]
            snippet = make_snippet(d.get("text", ""), q_toks)
            hits.append(Hit(
                doc_id=doc_id,
                score=round(float(score), 4),
                source=meta.source,
                snippet=snippet,
                title=meta.title,
                tags=meta.tags,
            ))
        return hits

    # ---------- persistence ----------

    def save(self, path: Path) -> None:
        """Serialize the whole index (including doc text) to JSON at `path`."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        # Store meta as dict to keep JSON serializable
        serial_docs = {
            doc_id: {
                "meta": asdict(d["meta"]),
                "len": d["len"],
                # store text to allow snippet generation after load (optional)
                "text": d.get("text", ""),
            }
            for doc_id, d in self.docs.items()
        }
        data = {
            "docs": serial_docs,
            "inv": self.inv,
            "df": self.df,
            "n_docs": self.n_docs,
            "saved_at": time.time(),
        }
        path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")

    @classmethod
    def load(cls, path: Path) -> "TfidfIndex":
        """Rebuild an index from a JSON file; returns an empty index if absent."""
        path = Path(path)
        idx = cls()
        if not path.is_file():
            return idx
        data = json.loads(path.read_text(encoding="utf-8"))
        # reconstruct docs with DocMeta
        docs: Dict[str, Dict] = {}
        for doc_id, d in data.get("docs", {}).items():
            m = d.get("meta", {})
            meta = DocMeta(**m) if m else DocMeta(doc_id=doc_id, source="unknown")
            docs[doc_id] = {
                "meta": meta,
                "len": d.get("len", 0),
                "text": d.get("text", ""),
            }
        idx.docs = docs
        idx.inv = {t: {k: int(v) for k, v in postings.items()} for t, postings in data.get("inv", {}).items()}
        idx.df = {t: int(v) for t, v in data.get("df", {}).items()}
        idx.n_docs = int(data.get("n_docs", len(idx.docs)))
        return idx

    # ---------- internals ----------

    def _remove_doc_terms(self, doc_id: str) -> None:
        """Remove a document's postings before re-adding."""
        if doc_id not in self.docs:
            return
        # delete postings
        for term, postings in list(self.inv.items()):
            if doc_id in postings:
                postings.pop(doc_id, None)
                if postings:
                    self.df[term] = len(postings)
                else:
                    # remove empty term
                    self.inv.pop(term, None)
                    self.df.pop(term, None)
        # delete doc
        self.docs.pop(doc_id, None)
        self.n_docs = len(self.docs)
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
# -----------------------------
|
| 292 |
+
# Utilities
|
| 293 |
+
# -----------------------------
|
| 294 |
+
|
| 295 |
+
def sha256_of(text: str) -> str:
    """Hex SHA-256 digest of `text` (None is treated as the empty string)."""
    payload = (text or "").encode("utf-8")
    return hashlib.sha256(payload).hexdigest()
|
| 297 |
+
|
| 298 |
+
def make_snippet(text: str, query_tokens: List[str], radius: int = 60) -> str:
    """
    Extract a small context window around the first query token found in
    `text`; fall back to the start of the document when nothing matches.
    Ellipses mark edges that were truncated.
    """
    if not text:
        return ""
    haystack = text.lower()
    for token in query_tokens:
        pos = haystack.find(token.lower())
        if pos < 0:
            continue
        start = max(0, pos - radius)
        end = min(len(text), pos + len(token) + radius)
        window = text[start:end].replace("\n", " ").strip()
        prefix = "…" if start > 0 else ""
        suffix = "…" if end < len(text) else ""
        return prefix + window + suffix
    # fallback: beginning of the doc
    head = text[: 2 * radius].replace("\n", " ").strip()
    return (head + "…") if len(text) > 2 * radius else head
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
# -----------------------------
|
| 322 |
+
# Convenience API (module-level)
|
| 323 |
+
# -----------------------------
|
| 324 |
+
|
| 325 |
+
# Default on-disk location for the persisted JSON index.
DEFAULT_INDEX_PATH = Path("memory/rag/data/.index/tfidf_index.json")

def build_from_folder(
    root: str | Path,
    include: Iterable[str] = ("*.txt", "*.md"),
    exclude: Iterable[str] = (".git/*",),
    save_to: str | Path = DEFAULT_INDEX_PATH,
    recursive: bool = True,
) -> TfidfIndex:
    """Index `root` into a fresh TfidfIndex, persist it to `save_to`, return it."""
    index = TfidfIndex()
    index.build_from_folder(
        Path(root), include=include, exclude=exclude, recursive=recursive
    )
    index.save(Path(save_to))
    return index
|
| 338 |
+
|
| 339 |
+
def load_index(path: str | Path = DEFAULT_INDEX_PATH) -> TfidfIndex:
    """Load a persisted index from `path` (empty index when the file is absent)."""
    return TfidfIndex.load(Path(path))
|
| 341 |
+
|
| 342 |
+
def search(query: str, k: int = 5, path: str | Path = DEFAULT_INDEX_PATH) -> List[Hit]:
    """One-shot convenience: load the index at `path` and run a query against it."""
    index = load_index(path)
    return index.search(query, k=k)
|
memory/rag/retriever.py
CHANGED
|
@@ -1 +1,268 @@
|
|
| 1 |
# /memory/rag/data/retriever.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /memory/rag/data/retriever.py
|
| 2 |
+
"""
|
| 3 |
+
Minimal RAG retriever that sits on top of the TF-IDF indexer.
|
| 4 |
+
|
| 5 |
+
Features
|
| 6 |
+
- Top-k document retrieval via indexer.search()
|
| 7 |
+
- Optional filters (tags, title substring)
|
| 8 |
+
- Passage extraction around query terms with overlap
|
| 9 |
+
- Lightweight proximity-based reranking of passages
|
| 10 |
+
|
| 11 |
+
No third-party dependencies; pairs with memory/rag/data/indexer.py.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
from typing import Dict, Iterable, List, Optional, Tuple
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
import math
|
| 19 |
+
import re
|
| 20 |
+
|
| 21 |
+
from .indexer import (
|
| 22 |
+
load_index,
|
| 23 |
+
search as index_search,
|
| 24 |
+
DEFAULT_INDEX_PATH,
|
| 25 |
+
tokenize,
|
| 26 |
+
TfidfIndex,
|
| 27 |
+
DocMeta,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# -----------------------------
|
| 31 |
+
# Public types
|
| 32 |
+
# -----------------------------
|
| 33 |
+
|
| 34 |
+
@dataclass(frozen=True)
class Passage:
    """One retrieved passage together with its provenance and combined score."""

    doc_id: str
    source: str
    title: Optional[str]
    tags: Optional[List[str]]
    score: float  # base index score, optionally boosted by the proximity rerank
    start: int    # character offset where the passage begins in the original text
    end: int      # character offset where the passage ends in the original text
    text: str     # the extracted passage itself
    snippet: str  # short display form (equals text when the passage is short)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@dataclass(frozen=True)
class Filters:
    """Optional post-search constraints applied to document hits."""

    # Case-insensitive substring that must appear in the doc title.
    title_contains: Optional[str] = None
    # Every listed tag must be present on the doc (logical AND).
    require_tags: Optional[Iterable[str]] = None
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# -----------------------------
|
| 54 |
+
# Retrieval API
|
| 55 |
+
# -----------------------------
|
| 56 |
+
|
| 57 |
+
def retrieve(
    query: str,
    k: int = 5,
    index_path: str | Path = DEFAULT_INDEX_PATH,
    filters: Optional[Filters] = None,
    passage_chars: int = 350,
    passage_overlap: int = 60,
    enable_rerank: bool = True,
) -> List[Passage]:
    """
    Retrieve the top-k passages for *query*.

    Pipeline: TF-IDF document search -> optional title/tag filtering ->
    per-document passage extraction -> optional proximity-based rerank.
    Returns an empty list for a blank query or an empty index.
    """
    index = load_index(index_path)
    if index.n_docs == 0 or not query.strip():
        return []

    # Overshoot the requested k; filtering and reranking trim the list later.
    doc_hits = index_search(query, k=max(k * 3, k), path=index_path)
    if filters:
        doc_hits = _apply_filters(doc_hits, index, filters)

    query_terms = tokenize(query)
    results: List[Passage] = []
    for hit in doc_hits:
        record = index.docs.get(hit.doc_id)
        if not record:
            continue
        meta: DocMeta = record["meta"]
        body = record.get("text", "") or ""
        p_start, p_end, p_text = _extract_passage(
            body, query_terms, window=passage_chars, overlap=passage_overlap
        )
        # Human-friendly snippet: truncate long passages with an ellipsis.
        short = p_text if len(p_text) <= 220 else p_text[:220].rstrip() + "…"
        results.append(
            Passage(
                doc_id=hit.doc_id,
                source=meta.source,
                title=meta.title,
                tags=meta.tags,
                score=float(hit.score),  # base score from the index
                start=p_start,
                end=p_end,
                text=p_text,
                snippet=short,
            )
        )

    if not results:
        return []

    if enable_rerank:
        results = _rerank_by_proximity(results, query_terms)

    results.sort(key=lambda p: p.score, reverse=True)
    return results[:k]
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def retrieve_texts(
    query: str,
    k: int = 5,
    **kwargs,
) -> List[str]:
    """Shortcut that returns only the passage texts for *query*."""
    passages = retrieve(query, k=k, **kwargs)
    return [passage.text for passage in passages]
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# -----------------------------
|
| 133 |
+
# Internals
|
| 134 |
+
# -----------------------------
|
| 135 |
+
|
| 136 |
+
def _apply_filters(hits, idx: TfidfIndex, filters: Filters):
    """Drop hits whose document fails the title/tag constraints in *filters*."""
    title_needle = (filters.title_contains or "").strip().lower() or None
    needed_tags = {t.strip().lower() for t in (filters.require_tags or []) if str(t).strip()}

    kept = []
    for hit in hits:
        record = idx.docs.get(hit.doc_id)
        if not record:
            continue
        meta: DocMeta = record["meta"]

        # Title filter: case-insensitive containment.
        if title_needle and title_needle not in (meta.title or "").lower():
            continue

        # Tag filter: every required tag must be present (AND semantics).
        if needed_tags:
            doc_tags = {x.lower() for x in (meta.tags or [])}
            if not needed_tags.issubset(doc_tags):
                continue

        kept.append(hit)
    return kept
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
_WORD_RE = re.compile(r"[A-Za-z0-9']+")
|
| 163 |
+
|
| 164 |
+
def _find_all(term: str, text: str) -> List[int]:
|
| 165 |
+
"""Return starting indices of all case-insensitive matches of term in text."""
|
| 166 |
+
if not term or not text:
|
| 167 |
+
return []
|
| 168 |
+
term_l = term.lower()
|
| 169 |
+
low = text.lower()
|
| 170 |
+
out: List[int] = []
|
| 171 |
+
i = low.find(term_l)
|
| 172 |
+
while i >= 0:
|
| 173 |
+
out.append(i)
|
| 174 |
+
i = low.find(term_l, i + 1)
|
| 175 |
+
return out
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _extract_passage(text: str, q_tokens: List[str], window: int = 350, overlap: int = 60) -> Tuple[int, int, str]:
    """
    Return (start, end, passage) for a window of *text* anchored near the
    earliest occurrence of any query token.

    The window begins ``overlap`` characters before the earliest hit so a
    little leading context is kept; if no token matches, the first
    ``window`` characters are returned instead. Empty text yields (0, 0, "").

    Fix: removed the dead ``low = text.lower()`` local, which lowercased the
    entire document on every call without ever being used.
    """
    if not text:
        return 0, 0, ""

    # Earliest case-insensitive hit of any query token decides the anchor.
    hit_positions: List[int] = []
    for qt in q_tokens:
        hit_positions.extend(_find_all(qt, text))

    if hit_positions:
        start = max(0, min(hit_positions) - overlap)
        end = min(len(text), start + window)
    else:
        start = 0
        end = min(len(text), window)

    return start, end, text[start:end].strip()
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def _rerank_by_proximity(passages: List[Passage], q_tokens: List[str]) -> List[Passage]:
    """
    Boost each passage's score by how tightly the query tokens cluster in it.

    For every unique query token, its first word-level occurrence is taken as
    a representative position; the average distance of those positions to
    their median is squashed into a bonus in [0, 0.25] (distance 0 -> 0.25,
    distance >= 10 words -> ~0) and added to the base score.

    Fix: build the adjusted Passage with ``dataclasses.replace`` instead of
    ``Passage(**{**p.__dict__, ...})`` — the latter reaches into a frozen
    dataclass's instance ``__dict__`` and breaks if ``slots=True`` is added.
    """
    from dataclasses import replace  # local import: module top only brings in `dataclass`

    q_unique = list(dict.fromkeys(q_tokens))  # dedupe while preserving order
    if not q_unique:
        return passages

    _ABSENT = 999999  # sentinel meaning "token does not occur in this passage"

    def word_positions(text: str, term: str) -> List[int]:
        """Word indices (not char offsets) at which *term* occurs in *text*."""
        words = [m.group(0).lower() for m in _WORD_RE.finditer(text)]
        return [i for i, w in enumerate(words) if w == term]

    def proximity_bonus(p: Passage) -> float:
        pos_lists = [word_positions(p.text, t) for t in q_unique]
        if all(not ps for ps in pos_lists):
            return 0.0

        # One representative position per token: first occurrence, or sentinel.
        reps = [ps[0] if ps else _ABSENT for ps in pos_lists]

        present = sorted(x for x in reps if x != _ABSENT)
        if not present:
            return 0.0
        mid = present[len(present) // 2]
        # Absent tokens contribute distance 0 (they collapse onto the median).
        avg_dist = sum(abs((x if x != _ABSENT else mid) - mid) for x in reps) / max(1, len(reps))

        # Squash distance into a bonus; tighter clustering -> bigger bonus.
        return float(max(0.0, 0.25 * (1.0 - min(avg_dist, 10.0) / 10.0)))

    return [replace(p, score=p.score + proximity_bonus(p)) for p in passages]
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# -----------------------------
|
| 259 |
+
# CLI / quick test
|
| 260 |
+
# -----------------------------
|
| 261 |
+
|
| 262 |
+
if __name__ == "__main__":
    import sys

    # Quick manual smoke test: pass a query on the command line.
    demo_query = " ".join(sys.argv[1:]) or "anonymous chatbot rules"
    for rank, p in enumerate(retrieve(demo_query, k=3), 1):
        print(f"[{rank}] {p.score:.4f} {p.title or '(untitled)'} — {p.source}")
        print(" ", (p.snippet.replace("\n", " ") if p.snippet else "")[:200])
|
memory/sessions.py
CHANGED
|
@@ -1 +1,244 @@
|
|
| 1 |
# /memory/sessions.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /memory/sessions.py
|
| 2 |
+
"""
|
| 3 |
+
Minimal session store for chat history + per-session data.
|
| 4 |
+
|
| 5 |
+
Features
|
| 6 |
+
- In-memory store with thread safety
|
| 7 |
+
- Create/get/update/delete sessions
|
| 8 |
+
- Append chat turns: ("user"| "bot", text)
|
| 9 |
+
- Optional TTL cleanup and max-history cap
|
| 10 |
+
- JSON persistence (save/load)
|
| 11 |
+
- Deterministic, dependency-free
|
| 12 |
+
|
| 13 |
+
Intended to interoperate with anon_bot and logged_in_bot:
|
| 14 |
+
- History shape: List[Tuple[str, str]] e.g., [("user","hi"), ("bot","hello")]
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
from dataclasses import dataclass, asdict, field
|
| 19 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
import time
|
| 22 |
+
import uuid
|
| 23 |
+
import json
|
| 24 |
+
import threading
|
| 25 |
+
|
| 26 |
+
History = List[Tuple[str, str]] # [("user","..."), ("bot","...")]
|
| 27 |
+
|
| 28 |
+
# -----------------------------
|
| 29 |
+
# Data model
|
| 30 |
+
# -----------------------------
|
| 31 |
+
|
| 32 |
+
@dataclass
class Session:
    """A single conversation: identity, timestamps, free-form data, transcript."""

    session_id: str
    user_id: Optional[str] = None
    created_at: float = field(default_factory=lambda: time.time())
    updated_at: float = field(default_factory=lambda: time.time())
    data: Dict[str, Any] = field(default_factory=dict)  # arbitrary per-session state
    history: History = field(default_factory=list)      # [("user"/"bot", text), ...]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain, JSON-friendly dict."""
        return asdict(self)

    @staticmethod
    def from_dict(d: Dict[str, Any]) -> "Session":
        """Rebuild a Session from a dict, coercing history entries back to tuples."""
        return Session(
            session_id=d["session_id"],
            user_id=d.get("user_id"),
            created_at=float(d.get("created_at", time.time())),
            updated_at=float(d.get("updated_at", time.time())),
            data=dict(d.get("data", {})),
            history=[(str(who), str(text)) for who, text in d.get("history", [])],
        )


# -----------------------------
# Store
# -----------------------------

class SessionStore:
    """
    Thread-safe in-memory session registry with optional TTL and persistence.
    """

    def __init__(
        self,
        ttl_seconds: Optional[int] = 60 * 60,  # 1 hour default; None disables expiry
        max_history: int = 200,                # transcript cap per session
    ) -> None:
        self._ttl = ttl_seconds
        self._max_history = max_history
        self._lock = threading.RLock()
        self._sessions: Dict[str, Session] = {}

    # ---- id helpers ----

    @staticmethod
    def new_id() -> str:
        """Opaque hex session id."""
        return uuid.uuid4().hex

    # ---- CRUD ----

    def create(self, user_id: Optional[str] = None, session_id: Optional[str] = None) -> Session:
        with self._lock:
            sid = session_id or self.new_id()
            sess = Session(session_id=sid, user_id=user_id)
            self._sessions[sid] = sess
            return sess

    def get(self, session_id: str, create_if_missing: bool = False, user_id: Optional[str] = None) -> Optional[Session]:
        with self._lock:
            sess = self._sessions.get(session_id)
            if sess is None and create_if_missing:
                sess = self.create(user_id=user_id, session_id=session_id)
            return sess

    def delete(self, session_id: str) -> bool:
        with self._lock:
            return self._sessions.pop(session_id, None) is not None

    def all_ids(self) -> List[str]:
        with self._lock:
            return list(self._sessions)

    # ---- housekeeping ----

    def _expired(self, s: Session) -> bool:
        if self._ttl is None:
            return False
        return (time.time() - s.updated_at) > self._ttl

    def sweep(self) -> int:
        """
        Remove expired sessions. Returns the number removed.
        """
        with self._lock:
            stale = [sid for sid, s in self._sessions.items() if self._expired(s)]
            for sid in stale:
                self._sessions.pop(sid, None)
            return len(stale)

    # ---- history ops ----

    def append_user(self, session_id: str, text: str) -> Session:
        return self._append(session_id, "user", text)

    def append_bot(self, session_id: str, text: str) -> Session:
        return self._append(session_id, "bot", text)

    def _append(self, session_id: str, who: str, text: str) -> Session:
        with self._lock:
            sess = self._sessions.get(session_id)
            if sess is None:
                sess = self.create(session_id=session_id)
            sess.history.append((who, text))
            if self._max_history and len(sess.history) > self._max_history:
                sess.history = sess.history[-self._max_history:]  # keep most recent N
            sess.updated_at = time.time()
            return sess

    def get_history(self, session_id: str) -> History:
        with self._lock:
            sess = self._sessions.get(session_id)
            return list(sess.history) if sess else []

    def clear_history(self, session_id: str) -> bool:
        with self._lock:
            sess = self._sessions.get(session_id)
            if not sess:
                return False
            sess.history.clear()
            sess.updated_at = time.time()
            return True

    # ---- key/value per-session data ----

    def set(self, session_id: str, key: str, value: Any) -> Session:
        with self._lock:
            sess = self._sessions.get(session_id)
            if sess is None:
                sess = self.create(session_id=session_id)
            sess.data[key] = value
            sess.updated_at = time.time()
            return sess

    def get_value(self, session_id: str, key: str, default: Any = None) -> Any:
        with self._lock:
            sess = self._sessions.get(session_id)
            return sess.data.get(key, default) if sess else default

    def data_dict(self, session_id: str) -> Dict[str, Any]:
        with self._lock:
            sess = self._sessions.get(session_id)
            return dict(sess.data) if sess else {}

    # ---- persistence ----

    def save(self, path: str | Path) -> None:
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with self._lock:
            payload = {
                "ttl_seconds": self._ttl,
                "max_history": self._max_history,
                "saved_at": time.time(),
                "sessions": {sid: s.to_dict() for sid, s in self._sessions.items()},
            }
            target.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")

    @classmethod
    def load(cls, path: str | Path) -> "SessionStore":
        source = Path(path)
        if not source.is_file():
            return cls()
        payload = json.loads(source.read_text(encoding="utf-8"))
        store = cls(
            ttl_seconds=payload.get("ttl_seconds"),
            max_history=int(payload.get("max_history", 200)),
        )
        with store._lock:
            for sid, sd in payload.get("sessions", {}).items():
                store._sessions[sid] = Session.from_dict(sd)
        return store
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# -----------------------------
|
| 214 |
+
# Module-level singleton (optional)
|
| 215 |
+
# -----------------------------
|
| 216 |
+
|
| 217 |
+
# Lazily-constructed process-wide default store.
_default_store: Optional[SessionStore] = None

def get_store() -> SessionStore:
    """Return the module-level SessionStore, creating it on first call."""
    global _default_store
    if _default_store is None:
        _default_store = SessionStore()
    return _default_store

def new_session(user_id: Optional[str] = None) -> Session:
    """Create a new session in the default store."""
    return get_store().create(user_id=user_id)

def append_user(session_id: str, text: str) -> Session:
    """Record a user turn in the default store."""
    return get_store().append_user(session_id, text)

def append_bot(session_id: str, text: str) -> Session:
    """Record a bot turn in the default store."""
    return get_store().append_bot(session_id, text)

def history(session_id: str) -> History:
    """Return a copy of the session transcript from the default store."""
    return get_store().get_history(session_id)

def set_value(session_id: str, key: str, value: Any) -> Session:
    """Set a per-session key/value in the default store."""
    return get_store().set(session_id, key, value)

def get_value(session_id: str, key: str, default: Any = None) -> Any:
    """Read a per-session value from the default store."""
    return get_store().get_value(session_id, key, default)

def sweep() -> int:
    """Expire idle sessions in the default store; return the count removed."""
    return get_store().sweep()
|
memory/store.py
CHANGED
|
@@ -1,3 +1,145 @@
|
|
| 1 |
# /memory/sessions.py
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /memory/sessions.py
|
| 2 |
+
"""
|
| 3 |
+
Simple in-memory session manager for chatbot history.
|
| 4 |
+
Supports TTL, max history, and JSON persistence.
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
import time, json, uuid
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
from typing import Dict, List, Tuple, Optional, Any
|
| 12 |
+
|
| 13 |
+
History = List[Tuple[str, str]] # [("user","..."), ("bot","...")]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
class Session:
    """One conversation's identity, transcript, and scratch data."""

    session_id: str
    user_id: Optional[str] = None
    history: History = field(default_factory=list)      # [("user"/"bot", text), ...]
    data: Dict[str, Any] = field(default_factory=dict)  # arbitrary per-session values
    created_at: float = field(default_factory=time.time)
    updated_at: float = field(default_factory=time.time)


class SessionStore:
    """
    In-memory session manager with TTL expiry, a history cap, and JSON persistence.

    Not thread-safe; wrap externally if shared across threads.
    """

    def __init__(self, ttl_seconds: Optional[int] = 3600, max_history: Optional[int] = 50):
        self.ttl_seconds = ttl_seconds  # None disables expiry
        self.max_history = max_history  # None disables the transcript cap
        self._sessions: Dict[str, Session] = {}

    # --- internals ---
    def _expired(self, sess: Session) -> bool:
        """True when the session has been idle longer than the TTL."""
        if self.ttl_seconds is None:
            return False
        return (time.time() - sess.updated_at) > self.ttl_seconds

    # --- CRUD ---
    def create(self, user_id: Optional[str] = None) -> Session:
        """Create and register a new session with a random UUID id."""
        sid = str(uuid.uuid4())
        sess = Session(session_id=sid, user_id=user_id)
        self._sessions[sid] = sess
        return sess

    def get(self, sid: str) -> Optional[Session]:
        return self._sessions.get(sid)

    def get_history(self, sid: str) -> History:
        """Copy of the session transcript, or [] for unknown ids."""
        sess = self.get(sid)
        return list(sess.history) if sess else []

    def append_user(self, sid: str, text: str) -> None:
        self._append(sid, "user", text)

    def append_bot(self, sid: str, text: str) -> None:
        self._append(sid, "bot", text)

    def _append(self, sid: str, who: str, text: str) -> None:
        """Append a turn; silently no-op for unknown session ids."""
        sess = self.get(sid)
        if not sess:
            return
        sess.history.append((who, text))
        if self.max_history and len(sess.history) > self.max_history:
            sess.history = sess.history[-self.max_history:]  # keep newest N
        sess.updated_at = time.time()

    # --- Data store ---
    def set(self, sid: str, key: str, value: Any) -> None:
        sess = self.get(sid)
        if sess:
            sess.data[key] = value
            sess.updated_at = time.time()

    def get_value(self, sid: str, key: str, default=None) -> Any:
        sess = self.get(sid)
        return sess.data.get(key, default) if sess else default

    def data_dict(self, sid: str) -> Dict[str, Any]:
        sess = self.get(sid)
        return dict(sess.data) if sess else {}

    # --- TTL management ---
    def sweep(self) -> int:
        """Remove expired sessions; return count removed."""
        expired = [sid for sid, s in self._sessions.items() if self._expired(s)]
        for sid in expired:
            self._sessions.pop(sid, None)
        return len(expired)

    def all_ids(self):
        return list(self._sessions.keys())

    # --- persistence ---
    def save(self, path: Path) -> None:
        """Write all sessions to *path* as pretty-printed JSON."""
        payload = {
            sid: {
                "user_id": s.user_id,
                "history": s.history,
                "data": s.data,
                "created_at": s.created_at,
                "updated_at": s.updated_at,
            }
            for sid, s in self._sessions.items()
        }
        path.write_text(json.dumps(payload, indent=2))

    @classmethod
    def load(cls, path: Path) -> "SessionStore":
        """
        Rebuild a store from save(); a missing file yields an empty store.

        Fix: JSON decodes history tuples as lists, so entries are coerced back
        to (who, text) tuples to match the History type and keep equality with
        in-memory transcripts after a save/load round-trip.
        """
        store = cls()
        if not path.exists():
            return store
        raw = json.loads(path.read_text())
        for sid, d in raw.items():
            store._sessions[sid] = Session(
                session_id=sid,
                user_id=d.get("user_id"),
                history=[(str(who), str(text)) for who, text in d.get("history", [])],
                data=d.get("data", {}),
                created_at=d.get("created_at", time.time()),
                updated_at=d.get("updated_at", time.time()),
            )
        return store
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# --- Module-level singleton for convenience ---
|
| 127 |
+
_store = SessionStore()  # module-level default store shared by the helpers below

def new_session(user_id: Optional[str] = None) -> Session:
    """Create a session in the shared default store."""
    return _store.create(user_id)

def history(sid: str) -> History:
    """Copy of the transcript for *sid* (empty list if unknown)."""
    return _store.get_history(sid)

def append_user(sid: str, text: str) -> None:
    """Record a user turn for *sid*; no-op for unknown ids."""
    _store.append_user(sid, text)

def append_bot(sid: str, text: str) -> None:
    """Record a bot turn for *sid*; no-op for unknown ids."""
    _store.append_bot(sid, text)

def set_value(sid: str, key: str, value: Any) -> None:
    """Set a per-session key/value; no-op for unknown ids."""
    _store.set(sid, key, value)

def get_value(sid: str, key: str, default=None) -> Any:
    """Read a per-session value, or *default* when absent."""
    return _store.get_value(sid, key, default)
|
nlu/pipeline.py
CHANGED
|
@@ -1,3 +1,77 @@
|
|
| 1 |
# /nlu/pipeline.py
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /nlu/pipeline.py
|
| 2 |
+
"""
|
| 3 |
+
Lightweight rule-based NLU pipeline.
|
| 4 |
|
| 5 |
+
No ML dependencies β just keyword matching and simple heuristics.
|
| 6 |
+
Provides intent classification and placeholder entity extraction.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Dict, List
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# keyword β intent maps
|
| 13 |
+
_INTENT_KEYWORDS = {
|
| 14 |
+
"greeting": {"hi", "hello", "hey", "good morning", "good evening"},
|
| 15 |
+
"goodbye": {"bye", "goodbye", "see you", "farewell"},
|
| 16 |
+
"help": {"help", "support", "assist", "how do i"},
|
| 17 |
+
"faq": {"what is", "who is", "where is", "when is", "how to"},
|
| 18 |
+
"sentiment_positive": {"great", "awesome", "fantastic", "love"},
|
| 19 |
+
"sentiment_negative": {"bad", "terrible", "hate", "awful"},
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _match_intent(text: str) -> str:
|
| 24 |
+
low = text.lower().strip()
|
| 25 |
+
for intent, kws in _INTENT_KEYWORDS.items():
|
| 26 |
+
for kw in kws:
|
| 27 |
+
if kw in low:
|
| 28 |
+
return intent
|
| 29 |
+
return "general"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _extract_entities(text: str) -> List[str]:
|
| 33 |
+
"""
|
| 34 |
+
Placeholder entity extractor.
|
| 35 |
+
For now just returns capitalized words (could be names/places).
|
| 36 |
+
"""
|
| 37 |
+
return [w for w in text.split() if w.istitle()]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def analyze(text: str) -> Dict:
    """
    Analyze a user utterance.

    Returns:
        {
            "intent": str,
            "entities": list[str],
            "confidence": float
        }

    Blank input maps to the "general" intent with zero confidence.
    """
    if not text or not text.strip():
        return {"intent": "general", "entities": [], "confidence": 0.0}

    label = _match_intent(text)
    found = _extract_entities(text)

    # Crude confidence: keyword hit -> 0.9, generic fallback -> 0.5.
    score = 0.9 if label != "general" else 0.5

    return {"intent": label, "entities": found, "confidence": score}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# quick test
|
| 67 |
+
if __name__ == "__main__":
    # Manual smoke test covering each intent plus the fallback.
    samples = [
        "Hello there",
        "Can you help me?",
        "I love this bot!",
        "Bye now",
        "Tell me what is RAG",
        "random input with no keywords",
    ]
    for sample in samples:
        print(sample, "->", analyze(sample))
|
nlu/prompts.py
CHANGED
|
@@ -1 +1,78 @@
|
|
| 1 |
# /nlu/prompts.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /nlu/prompts.py
|
| 2 |
+
"""
|
| 3 |
+
Reusable prompt templates for NLU and chatbot responses.
|
| 4 |
+
|
| 5 |
+
These can be imported anywhere in the app to keep wording consistent.
|
| 6 |
+
They are plain strings / dicts β no external deps required.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Dict
|
| 10 |
+
|
| 11 |
+
# -----------------------------
|
| 12 |
+
# System prompts
|
| 13 |
+
# -----------------------------
|
| 14 |
+
|
| 15 |
+
SYSTEM_BASE = """\
|
| 16 |
+
You are a helpful, polite chatbot.
|
| 17 |
+
Answer briefly unless asked for detail.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
SYSTEM_FAQ = """\
|
| 21 |
+
You are a factual Q&A assistant.
|
| 22 |
+
Answer questions directly, citing facts when possible.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
SYSTEM_SUPPORT = """\
|
| 26 |
+
You are a friendly support assistant.
|
| 27 |
+
Offer clear, step-by-step help when the user asks for guidance.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
# -----------------------------
|
| 31 |
+
# Few-shot examples
|
| 32 |
+
# -----------------------------
|
| 33 |
+
|
| 34 |
+
FEW_SHOTS: Dict[str, list] = {
|
| 35 |
+
"greeting": [
|
| 36 |
+
{"user": "Hello", "bot": "Hi there! How can I help you today?"},
|
| 37 |
+
{"user": "Good morning", "bot": "Good morning! Whatβs up?"},
|
| 38 |
+
],
|
| 39 |
+
"goodbye": [
|
| 40 |
+
{"user": "Bye", "bot": "Goodbye! Have a great day."},
|
| 41 |
+
{"user": "See you later", "bot": "See you!"},
|
| 42 |
+
],
|
| 43 |
+
"help": [
|
| 44 |
+
{"user": "I need help", "bot": "Sure! What do you need help with?"},
|
| 45 |
+
{"user": "Can you assist me?", "bot": "Of course, happy to assist."},
|
| 46 |
+
],
|
| 47 |
+
"faq": [
|
| 48 |
+
{"user": "What is RAG?", "bot": "RAG stands for Retrieval-Augmented Generation."},
|
| 49 |
+
{"user": "Who created this bot?", "bot": "It was built by our project team."},
|
| 50 |
+
],
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
# -----------------------------
|
| 54 |
+
# Utility
|
| 55 |
+
# -----------------------------
|
| 56 |
+
|
| 57 |
+
def get_system_prompt(mode: str = "base") -> str:
|
| 58 |
+
"""
|
| 59 |
+
Return a system-level prompt string.
|
| 60 |
+
mode: "base" | "faq" | "support"
|
| 61 |
+
"""
|
| 62 |
+
if mode == "faq":
|
| 63 |
+
return SYSTEM_FAQ
|
| 64 |
+
if mode == "support":
|
| 65 |
+
return SYSTEM_SUPPORT
|
| 66 |
+
return SYSTEM_BASE
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def get_few_shots(intent: str) -> list:
|
| 70 |
+
"""
|
| 71 |
+
Return few-shot examples for a given intent label.
|
| 72 |
+
"""
|
| 73 |
+
return FEW_SHOTS.get(intent, [])
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
print("System prompt:", get_system_prompt("faq"))
|
| 78 |
+
print("Examples for 'greeting':", get_few_shots("greeting"))
|
nlu/router.py
CHANGED
|
@@ -1 +1,143 @@
|
|
| 1 |
# /nlu/router.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /nlu/router.py
|
| 2 |
+
"""
|
| 3 |
+
Lightweight NLU router.
|
| 4 |
+
|
| 5 |
+
- Uses nlu.pipeline.analyze() to classify the user's intent.
|
| 6 |
+
- Maps intents to high-level actions (GREETING, HELP, FAQ, ECHO, SUMMARIZE, GENERAL, GOODBYE).
|
| 7 |
+
- Provides:
|
| 8 |
+
route(text, ctx=None) -> dict with intent, action, handler, params
|
| 9 |
+
respond(text, history) -> quick deterministic reply for smoke tests
|
| 10 |
+
|
| 11 |
+
This file deliberately avoids external dependencies so it works in anonymous mode.
|
| 12 |
+
Later, you can swap 'handler' targets to real modules (e.g., anon_bot, logged_in_bot).
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
from dataclasses import dataclass, asdict
|
| 17 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 18 |
+
|
| 19 |
+
from .pipeline import analyze
|
| 20 |
+
from .prompts import get_system_prompt, get_few_shots
|
| 21 |
+
|
| 22 |
+
History = List[Tuple[str, str]] # [("user","..."), ("bot","...")]
|
| 23 |
+
|
| 24 |
+
# -----------------------------
|
| 25 |
+
# Action / Route schema
|
| 26 |
+
# -----------------------------
|
| 27 |
+
|
| 28 |
+
@dataclass(frozen=True)
|
| 29 |
+
class Route:
|
| 30 |
+
intent: str
|
| 31 |
+
action: str
|
| 32 |
+
handler: str # suggested dotted path or logical name
|
| 33 |
+
params: Dict[str, Any] # arbitrary params (e.g., {"mode":"faq"})
|
| 34 |
+
confidence: float
|
| 35 |
+
|
| 36 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 37 |
+
return asdict(self)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Intent -> (Action, Suggested Handler, Default Params)
|
| 41 |
+
_ACTION_TABLE: Dict[str, Tuple[str, str, Dict[str, Any]]] = {
|
| 42 |
+
"greeting": ("GREETING", "builtin.respond", {"mode": "base"}),
|
| 43 |
+
"goodbye": ("GOODBYE", "builtin.respond", {"mode": "base"}),
|
| 44 |
+
"help": ("HELP", "builtin.respond", {"mode": "support"}),
|
| 45 |
+
"faq": ("FAQ", "builtin.respond", {"mode": "faq"}),
|
| 46 |
+
# Sentiment intents come from pipeline; treat as GENERAL but note tag:
|
| 47 |
+
"sentiment_positive": ("GENERAL", "builtin.respond", {"mode": "base", "tag": "positive"}),
|
| 48 |
+
"sentiment_negative": ("GENERAL", "builtin.respond", {"mode": "base", "tag": "negative"}),
|
| 49 |
+
# Default:
|
| 50 |
+
"general": ("GENERAL", "builtin.respond", {"mode": "base"}),
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
_DEFAULT_ACTION = ("GENERAL", "builtin.respond", {"mode": "base"})
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# -----------------------------
|
| 57 |
+
# Routing
|
| 58 |
+
# -----------------------------
|
| 59 |
+
|
| 60 |
+
def route(text: str, ctx: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 61 |
+
"""
|
| 62 |
+
Decide which action/handler should process the utterance.
|
| 63 |
+
"""
|
| 64 |
+
nlu = analyze(text or "")
|
| 65 |
+
intent = nlu.get("intent", "general")
|
| 66 |
+
confidence = float(nlu.get("confidence", 0.0))
|
| 67 |
+
action, handler, params = _ACTION_TABLE.get(intent, _DEFAULT_ACTION)
|
| 68 |
+
|
| 69 |
+
# pass-through entities as params for downstream handlers
|
| 70 |
+
entities = nlu.get("entities") or []
|
| 71 |
+
if entities:
|
| 72 |
+
params = {**params, "entities": entities}
|
| 73 |
+
|
| 74 |
+
# include minimal context (optional)
|
| 75 |
+
if ctx:
|
| 76 |
+
params = {**params, "_ctx": ctx}
|
| 77 |
+
|
| 78 |
+
return Route(
|
| 79 |
+
intent=intent,
|
| 80 |
+
action=action,
|
| 81 |
+
handler=handler,
|
| 82 |
+
params=params,
|
| 83 |
+
confidence=confidence,
|
| 84 |
+
).to_dict()
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# -----------------------------
|
| 88 |
+
# Built-in deterministic responder (for smoke tests)
|
| 89 |
+
# -----------------------------
|
| 90 |
+
|
| 91 |
+
def respond(text: str, history: Optional[History] = None) -> str:
|
| 92 |
+
"""
|
| 93 |
+
Produce a tiny, deterministic response using system/few-shot text.
|
| 94 |
+
This is only for local testing; replace with real handlers later.
|
| 95 |
+
"""
|
| 96 |
+
r = route(text)
|
| 97 |
+
intent = r["intent"]
|
| 98 |
+
action = r["action"]
|
| 99 |
+
mode = r["params"].get("mode", "base")
|
| 100 |
+
|
| 101 |
+
# Choose a system flavor (not used to prompt a model here, but keeps wording consistent)
|
| 102 |
+
_ = get_system_prompt("support" if action == "HELP" else ("faq" if action == "FAQ" else "base"))
|
| 103 |
+
# Few-shots can inform canned replies (again: no model used, just tone)
|
| 104 |
+
shots = get_few_shots(intent)
|
| 105 |
+
|
| 106 |
+
if action == "GREETING":
|
| 107 |
+
return "Hi! How can I help you today?"
|
| 108 |
+
if action == "GOODBYE":
|
| 109 |
+
return "Goodbye! Have a great day."
|
| 110 |
+
if action == "HELP":
|
| 111 |
+
return "I can answer quick questions, echo text, or summarize short passages. What do you need help with?"
|
| 112 |
+
if action == "FAQ":
|
| 113 |
+
# Trivial FAQ-style echo; swap with RAG later
|
| 114 |
+
return "Ask a specific question (e.g., 'What is RAG?'), and Iβll answer briefly."
|
| 115 |
+
# GENERAL:
|
| 116 |
+
# If the pipeline flagged sentiment, acknowledge gently.
|
| 117 |
+
tag = r["params"].get("tag")
|
| 118 |
+
if tag == "positive":
|
| 119 |
+
prefix = "Glad to hear it! "
|
| 120 |
+
elif tag == "negative":
|
| 121 |
+
prefix = "Sorry to hear that. "
|
| 122 |
+
else:
|
| 123 |
+
prefix = ""
|
| 124 |
+
return prefix + "Noted. If you need help, type 'help'."
|
| 125 |
+
|
| 126 |
+
# -----------------------------
|
| 127 |
+
# Simple CLI smoke test
|
| 128 |
+
# -----------------------------
|
| 129 |
+
|
| 130 |
+
if __name__ == "__main__":
|
| 131 |
+
tests = [
|
| 132 |
+
"Hello there",
|
| 133 |
+
"Can you help me?",
|
| 134 |
+
"What is RAG in simple terms?",
|
| 135 |
+
"This is awful.",
|
| 136 |
+
"Bye!",
|
| 137 |
+
"random input with no keywords",
|
| 138 |
+
]
|
| 139 |
+
for t in tests:
|
| 140 |
+
print(f"> {t}")
|
| 141 |
+
print(" route:", route(t))
|
| 142 |
+
print(" reply:", respond(t))
|
| 143 |
+
print()
|
requirements-dev.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytest>=7.4.0
|
| 2 |
+
pytest-cov>=4.1.0
|
| 3 |
+
black>=24.3.0
|
| 4 |
+
isort>=5.13.0
|
| 5 |
+
flake8>=7.0.0
|
| 6 |
+
mypy>=1.10.0
|
| 7 |
+
ruff>=0.5.0
|
requirements-ml.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers>=4.41.0
|
| 2 |
+
torch>=2.2.0
|
| 3 |
+
|
| 4 |
+
# extras commonly required by transformers
|
| 5 |
+
safetensors>=0.4.0
|
| 6 |
+
accelerate>=0.33.0
|
| 7 |
+
sentencepiece>=0.2.0
|
requirements.txt
CHANGED
|
@@ -1,15 +1,12 @@
|
|
| 1 |
-
gradio>=4.0
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
| 6 |
numpy>=1.26.0
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
azure-ai-textanalytics>=5.3.0
|
| 10 |
-
python-dotenv>=1.0
|
| 11 |
-
fastapi>=0.115.0
|
| 12 |
-
uvicorn[standard]>=0.30.0
|
| 13 |
-
# Optional for Bot Framework sample:
|
| 14 |
-
# aiohttp>=3.9
|
| 15 |
-
# botbuilder-core>=4.14
|
|
|
|
| 1 |
+
gradio>=4.0,<5
|
| 2 |
+
fastapi>=0.115.0,<0.116
|
| 3 |
+
uvicorn[standard]>=0.30.0,<0.31
|
| 4 |
+
python-dotenv>=1.0
|
| 5 |
+
|
| 6 |
+
# light numeric stack
|
| 7 |
numpy>=1.26.0
|
| 8 |
+
pandas>=2.1.0
|
| 9 |
+
scikit-learn>=1.3.0
|
| 10 |
+
|
| 11 |
+
# optional Azure integration
|
| 12 |
azure-ai-textanalytics>=5.3.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/check_compliance.py
CHANGED
|
@@ -1,3 +1,81 @@
|
|
| 1 |
# /scripts/check_compliance.py
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /scripts/check_compliance.py
|
| 2 |
+
#!/usr/bin/env python3
|
| 3 |
+
"""
|
| 4 |
+
Compliance checker for disallowed dependencies.
|
| 5 |
|
| 6 |
+
- Scans all .py files under project root (excluding venv/.git/etc).
|
| 7 |
+
- Flags imports of disallowed packages (by prefix).
|
| 8 |
+
- Exits nonzero if any violations are found.
|
| 9 |
+
|
| 10 |
+
Run:
|
| 11 |
+
python scripts/check_compliance.py
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
import os
|
| 16 |
+
import re
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
# -----------------------------
|
| 20 |
+
# Config
|
| 21 |
+
# -----------------------------
|
| 22 |
+
|
| 23 |
+
# Disallowed top-level import prefixes
|
| 24 |
+
DISALLOWED = {
|
| 25 |
+
"torch",
|
| 26 |
+
"tensorflow",
|
| 27 |
+
"transformers",
|
| 28 |
+
"openai",
|
| 29 |
+
"azure.ai", # heavy cloud SDK
|
| 30 |
+
"azureml",
|
| 31 |
+
"boto3",
|
| 32 |
+
"botbuilder", # Microsoft Bot Framework
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
IGNORE_DIRS = {".git", "__pycache__", "venv", ".venv", "env", ".env", "node_modules"}
|
| 36 |
+
|
| 37 |
+
IMPORT_RE = re.compile(r"^\s*(?:import|from)\s+([a-zA-Z0-9_.]+)")
|
| 38 |
+
|
| 39 |
+
# -----------------------------
|
| 40 |
+
# Scan
|
| 41 |
+
# -----------------------------
|
| 42 |
+
|
| 43 |
+
def scan_file(path: Path) -> list[str]:
|
| 44 |
+
bad = []
|
| 45 |
+
try:
|
| 46 |
+
lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
|
| 47 |
+
except Exception as e:
|
| 48 |
+
print(f"[warn] could not read {path}: {e}", file=sys.stderr)
|
| 49 |
+
return []
|
| 50 |
+
for i, line in enumerate(lines, 1):
|
| 51 |
+
m = IMPORT_RE.match(line)
|
| 52 |
+
if not m:
|
| 53 |
+
continue
|
| 54 |
+
mod = m.group(1)
|
| 55 |
+
for banned in DISALLOWED:
|
| 56 |
+
if mod == banned or mod.startswith(banned + "."):
|
| 57 |
+
bad.append(f"{path}:{i}: disallowed import '{mod}'")
|
| 58 |
+
return bad
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def main(root: str = ".") -> int:
|
| 62 |
+
root = Path(root)
|
| 63 |
+
failures: list[str] = []
|
| 64 |
+
|
| 65 |
+
for p in root.rglob("*.py"):
|
| 66 |
+
if any(part in IGNORE_DIRS for part in p.parts):
|
| 67 |
+
continue
|
| 68 |
+
failures.extend(scan_file(p))
|
| 69 |
+
|
| 70 |
+
if failures:
|
| 71 |
+
print("β Compliance check failed:")
|
| 72 |
+
for f in failures:
|
| 73 |
+
print(" ", f)
|
| 74 |
+
return 1
|
| 75 |
+
else:
|
| 76 |
+
print("β
Compliance check passed (no disallowed deps).")
|
| 77 |
+
return 0
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
if __name__ == "__main__":
|
| 81 |
+
sys.exit(main())
|
scripts/run_local.sh
CHANGED
|
@@ -1,5 +1,45 @@
|
|
| 1 |
# /scripts/run_local.sh
|
| 2 |
#!/usr/bin/env bash
|
| 3 |
-
set -
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /scripts/run_local.sh
|
| 2 |
#!/usr/bin/env bash
|
| 3 |
+
set -Eeuo pipefail
|
| 4 |
+
|
| 5 |
+
# Move to repo root
|
| 6 |
+
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
| 7 |
+
cd "$ROOT_DIR"
|
| 8 |
+
|
| 9 |
+
# --- Configuration via env (with sane defaults) ---
|
| 10 |
+
export PYTHONPATH="${PYTHONPATH:-.}"
|
| 11 |
+
HOST="${HOST:-0.0.0.0}"
|
| 12 |
+
PORT="${PORT:-7860}"
|
| 13 |
+
MODE="${MODE:-gradio}" # gradio | uvicorn
|
| 14 |
+
RELOAD="${RELOAD:-false}" # only applies to MODE=uvicorn
|
| 15 |
+
INSTALL="${INSTALL:-0}" # set INSTALL=1 to pip install requirements
|
| 16 |
+
|
| 17 |
+
# Load .env if present (ignore comments/blank lines)
|
| 18 |
+
if [[ -f .env ]]; then
|
| 19 |
+
# shellcheck disable=SC2046
|
| 20 |
+
export $(grep -vE '^\s*#' .env | grep -vE '^\s*$' | xargs -0 -I{} bash -c 'printf "%s\0" "{}"' 2>/dev/null || true)
|
| 21 |
+
fi
|
| 22 |
+
|
| 23 |
+
if [[ "$INSTALL" == "1" ]]; then
|
| 24 |
+
echo "π¦ Installing dependencies from requirements.txt ..."
|
| 25 |
+
python -m pip install -r requirements.txt
|
| 26 |
+
fi
|
| 27 |
+
|
| 28 |
+
trap 'echo; echo "β Server terminated";' INT TERM
|
| 29 |
+
|
| 30 |
+
if [[ "$MODE" == "uvicorn" ]]; then
|
| 31 |
+
# Dev-friendly server with optional reload (expects FastAPI app factory)
|
| 32 |
+
echo "βΆ Starting Uvicorn on http://${HOST}:${PORT} (reload=${RELOAD})"
|
| 33 |
+
# If you expose a FastAPI app object directly, adjust target accordingly (e.g., storefront_chatbot.app.app:app)
|
| 34 |
+
cmd=(python -m uvicorn storefront_chatbot.app.app:build --host "$HOST" --port "$PORT")
|
| 35 |
+
[[ "$RELOAD" == "true" ]] && cmd+=(--reload)
|
| 36 |
+
exec "${cmd[@]}"
|
| 37 |
+
else
|
| 38 |
+
# Gradio path (matches your original build().launch)
|
| 39 |
+
echo "βΆ Starting Gradio on http://${HOST}:${PORT}"
|
| 40 |
+
python - <<PY
|
| 41 |
+
from storefront_chatbot.app.app import build
|
| 42 |
+
app = build()
|
| 43 |
+
app.launch(server_name="${HOST}", server_port=${PORT})
|
| 44 |
+
PY
|
| 45 |
+
fi
|
scripts/seed_data.py
CHANGED
|
@@ -1,3 +1,94 @@
|
|
| 1 |
# /scripts/seed_data.py
|
| 2 |
-
#
|
|
|
|
|
|
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /scripts/seed_data.py
|
| 2 |
+
#!/usr/bin/env python3
|
| 3 |
+
"""
|
| 4 |
+
Seed script to load sample products and FAQs into local data files.
|
| 5 |
|
| 6 |
+
- Creates ./data/products.json and ./data/faqs.json
|
| 7 |
+
- Provides a CLI to re-seed or show contents
|
| 8 |
+
- No external dependencies required
|
| 9 |
+
|
| 10 |
+
Run:
|
| 11 |
+
python scripts/seed_data.py # create seed files
|
| 12 |
+
python scripts/seed_data.py show # print contents
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import sys
|
| 16 |
+
import json
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
import datetime
|
| 19 |
+
|
| 20 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 21 |
+
DATA_DIR = ROOT / "data"
|
| 22 |
+
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 23 |
+
|
| 24 |
+
PRODUCTS_PATH = DATA_DIR / "products.json"
|
| 25 |
+
FAQS_PATH = DATA_DIR / "faqs.json"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
SAMPLE_PRODUCTS = [
|
| 29 |
+
{
|
| 30 |
+
"id": "p1",
|
| 31 |
+
"name": "Chatbot Pro Subscription",
|
| 32 |
+
"description": "Access advanced features of the chatbot platform.",
|
| 33 |
+
"price": 9.99,
|
| 34 |
+
"currency": "USD",
|
| 35 |
+
"tags": ["subscription", "chatbot"],
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"id": "p2",
|
| 39 |
+
"name": "Custom Bot Avatar",
|
| 40 |
+
"description": "A personalized avatar for your chatbot.",
|
| 41 |
+
"price": 4.99,
|
| 42 |
+
"currency": "USD",
|
| 43 |
+
"tags": ["avatar", "customization"],
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"id": "p3",
|
| 47 |
+
"name": "Analytics Dashboard",
|
| 48 |
+
"description": "Real-time analytics and reporting for your conversations.",
|
| 49 |
+
"price": 14.99,
|
| 50 |
+
"currency": "USD",
|
| 51 |
+
"tags": ["analytics", "dashboard"],
|
| 52 |
+
},
|
| 53 |
+
]
|
| 54 |
+
|
| 55 |
+
SAMPLE_FAQS = [
|
| 56 |
+
{
|
| 57 |
+
"q": "How do I reset my password?",
|
| 58 |
+
"a": "Click 'Forgot password' on the login page and follow the instructions.",
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"q": "Can I export my chat history?",
|
| 62 |
+
"a": "Yes, you can export your chat history from the account settings page.",
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"q": "Do you offer refunds?",
|
| 66 |
+
"a": "Refunds are available within 14 days of purchase. Contact support for help.",
|
| 67 |
+
},
|
| 68 |
+
]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def write_json(path: Path, data) -> None:
|
| 72 |
+
path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def seed() -> None:
|
| 76 |
+
write_json(PRODUCTS_PATH, SAMPLE_PRODUCTS)
|
| 77 |
+
write_json(FAQS_PATH, SAMPLE_FAQS)
|
| 78 |
+
print(f"β
Seeded data at {datetime.date.today()} into {DATA_DIR}")
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def show() -> None:
|
| 82 |
+
if PRODUCTS_PATH.is_file():
|
| 83 |
+
print("Products:")
|
| 84 |
+
print(PRODUCTS_PATH.read_text(encoding="utf-8"))
|
| 85 |
+
if FAQS_PATH.is_file():
|
| 86 |
+
print("\nFAQs:")
|
| 87 |
+
print(FAQS_PATH.read_text(encoding="utf-8"))
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
if __name__ == "__main__":
|
| 91 |
+
if len(sys.argv) > 1 and sys.argv[1] == "show":
|
| 92 |
+
show()
|
| 93 |
+
else:
|
| 94 |
+
seed()
|
tests/test_anon_bot.py
CHANGED
|
@@ -1,3 +1,121 @@
|
|
| 1 |
# /test/test_anon_bot.py
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
|
|
|
| 1 |
# /test/test_anon_bot.py
|
| 2 |
+
"""
|
| 3 |
+
Comprehensive smoke tests for anon_bot.
|
| 4 |
+
Run with: pytest -q
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
from anon_bot import handler, rules
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# ---------- rules: intents & handlers ----------
|
| 12 |
+
|
| 13 |
+
@pytest.mark.parametrize(
|
| 14 |
+
"msg,expected",
|
| 15 |
+
[
|
| 16 |
+
("", "empty"),
|
| 17 |
+
("help", "help"),
|
| 18 |
+
("/help", "help"),
|
| 19 |
+
("capabilities", "help"),
|
| 20 |
+
("reverse abc", "reverse"),
|
| 21 |
+
("echo hello world", "echo"),
|
| 22 |
+
("hi", "greet"),
|
| 23 |
+
("hello", "greet"),
|
| 24 |
+
("hey", "greet"),
|
| 25 |
+
("who are you", "chat"),
|
| 26 |
+
],
|
| 27 |
+
)
|
| 28 |
+
def test_rules_intent_of(msg, expected):
|
| 29 |
+
assert rules.intent_of(msg) == expected
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_rules_capabilities_contains_expected_items():
|
| 33 |
+
caps = rules.capabilities()
|
| 34 |
+
assert "help" in caps
|
| 35 |
+
assert any(c.startswith("reverse") for c in caps)
|
| 36 |
+
assert any(c.startswith("echo") for c in caps)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def test_rules_handlers_basic():
|
| 40 |
+
assert "I can:" in rules.handle_help().text
|
| 41 |
+
assert rules.handle_reverse("reverse hello").text == "olleh"
|
| 42 |
+
assert rules.handle_reverse("reverse").text == "(nothing to reverse)"
|
| 43 |
+
assert rules.handle_echo("echo one two").text == "one two"
|
| 44 |
+
assert rules.handle_echo("echo").text == "(nothing to echo)"
|
| 45 |
+
assert "Type 'help'" in rules.handle_greet().text
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def test_rules_reply_for_empty_and_chat_paths():
|
| 49 |
+
r = rules.reply_for("", [])
|
| 50 |
+
assert "Please type something" in r.text
|
| 51 |
+
|
| 52 |
+
r2 = rules.reply_for("who are you", [])
|
| 53 |
+
assert "tiny anonymous chatbot" in r2.text
|
| 54 |
+
|
| 55 |
+
r3 = rules.reply_for("can you help me", [])
|
| 56 |
+
assert "I can:" in r3.text # chat fallback detects 'help' and returns help
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# ---------- handler: history & turn processing ----------
|
| 60 |
+
|
| 61 |
+
def test_handle_turn_appends_user_and_bot():
|
| 62 |
+
hist = []
|
| 63 |
+
out = handler.handle_turn("hello", hist, user=None)
|
| 64 |
+
# last two entries should be ("user", ...), ("bot", ...)
|
| 65 |
+
assert out[-2][0] == "user" and out[-2][1] == "hello"
|
| 66 |
+
assert out[-1][0] == "bot" and "Type 'help'" in out[-1][1]
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_handle_turn_with_existing_history_preserves_items():
|
| 70 |
+
h2 = [("user", "prev"), ("bot", "ok")]
|
| 71 |
+
out2 = handler.handle_turn("echo ping", h2, user=None)
|
| 72 |
+
assert out2[:2] == h2 # preserved
|
| 73 |
+
assert out2[-1][0] == "bot"
|
| 74 |
+
assert out2[-1][1] == "ping" # echo payload
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_handle_text_convenience():
|
| 78 |
+
reply = handler.handle_text("reverse abc")
|
| 79 |
+
assert reply == "cba"
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def test_handle_turn_empty_message_produces_prompt():
|
| 83 |
+
out = handler.handle_turn("", [], user=None)
|
| 84 |
+
assert out[-1][0] == "bot"
|
| 85 |
+
assert "Please type" in out[-1][1]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_handler_coerces_weird_history_without_crashing():
|
| 89 |
+
# Mix of tuples, lists, malformed entries, and non-iterables
|
| 90 |
+
weird = [
|
| 91 |
+
("user", "ok"),
|
| 92 |
+
["bot", "fine"],
|
| 93 |
+
"garbage",
|
| 94 |
+
("only_one_element",),
|
| 95 |
+
("user", 123),
|
| 96 |
+
42,
|
| 97 |
+
None,
|
| 98 |
+
]
|
| 99 |
+
out = handler.handle_turn("hi", weird, user=None)
|
| 100 |
+
# Should include a normalized user entry and a bot reply at the end
|
| 101 |
+
assert out[-2] == ("user", "hi")
|
| 102 |
+
assert out[-1][0] == "bot"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ---------- end-to-end mini scriptable checks ----------
|
| 106 |
+
|
| 107 |
+
def test_greet_help_echo_reverse_flow():
|
| 108 |
+
h = []
|
| 109 |
+
h = handler.handle_turn("hi", h, None)
|
| 110 |
+
assert "help" in h[-1][1].lower()
|
| 111 |
+
|
| 112 |
+
h = handler.handle_turn("help", h, None)
|
| 113 |
+
assert "I can:" in h[-1][1]
|
| 114 |
+
|
| 115 |
+
h = handler.handle_turn("echo alpha beta", h, None)
|
| 116 |
+
assert h[-1][1] == "alpha beta"
|
| 117 |
+
|
| 118 |
+
h = handler.handle_turn("reverse zed", h, None)
|
| 119 |
+
assert h[-1][1] == "dez"
|
| 120 |
+
|
| 121 |
|
tests/test_guardrails.py
CHANGED
|
@@ -1,2 +1,40 @@
|
|
| 1 |
# /test/test_guardrails.py
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /test/test_guardrails.py
|
| 2 |
+
"""
|
| 3 |
+
Guardrail tests:
|
| 4 |
+
- Ensure compliance checker passes (no disallowed deps imported).
|
| 5 |
+
- Ensure anon_bot.rules doesn't produce unsafe replies for empty / bad input.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import subprocess
|
| 9 |
+
import sys
|
| 10 |
+
import pathlib
|
| 11 |
+
|
| 12 |
+
import pytest
|
| 13 |
+
|
| 14 |
+
from anon_bot import rules
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_compliance_script_runs_clean():
|
| 18 |
+
root = pathlib.Path(__file__).resolve().parent.parent
|
| 19 |
+
script = root / "scripts" / "check_compliance.py"
|
| 20 |
+
# Run as a subprocess so we catch real exit code
|
| 21 |
+
proc = subprocess.run([sys.executable, str(script)], capture_output=True, text=True)
|
| 22 |
+
# If it fails, dump output for debugging
|
| 23 |
+
if proc.returncode != 0:
|
| 24 |
+
print(proc.stdout)
|
| 25 |
+
print(proc.stderr, file=sys.stderr)
|
| 26 |
+
assert proc.returncode == 0
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@pytest.mark.parametrize("msg", ["", None, " "])
|
| 30 |
+
def test_rules_empty_prompts_are_safe(msg):
|
| 31 |
+
r = rules.reply_for(msg or "", [])
|
| 32 |
+
# Should politely nudge the user, not crash
|
| 33 |
+
assert "Please" in r.text or "help" in r.text.lower()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@pytest.mark.parametrize("msg", ["rm -rf /", "DROP TABLE users;"])
|
| 37 |
+
def test_rules_handles_malicious_looking_input(msg):
|
| 38 |
+
r = rules.reply_for(msg, [])
|
| 39 |
+
# The bot should fall back safely to generic chat response
|
| 40 |
+
assert "Noted" in r.text or "help" in r.text
|
tests/test_indexer.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /tests/test_indexer.py
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from memory.rag.data.indexer import TfidfIndex, search, DEFAULT_INDEX_PATH
|
| 4 |
+
|
| 5 |
+
def test_add_and_search(tmp_path: Path):
|
| 6 |
+
p = tmp_path / "a.md"
|
| 7 |
+
p.write_text("Hello world. This is an anonymous chatbot.\nRules are simple.", encoding="utf-8")
|
| 8 |
+
idx = TfidfIndex()
|
| 9 |
+
idx.add_file(p)
|
| 10 |
+
hits = idx.search("anonymous rules", k=5)
|
| 11 |
+
assert hits and hits[0].doc_id == str(p.resolve())
|
| 12 |
+
|
| 13 |
+
def test_persist_and_load(tmp_path: Path):
|
| 14 |
+
p = tmp_path / "index.json"
|
| 15 |
+
idx = TfidfIndex()
|
| 16 |
+
idx.add_text("id1", "cats are great, dogs are cool", meta=__meta("id1"))
|
| 17 |
+
idx.save(p)
|
| 18 |
+
loaded = TfidfIndex.load(p)
|
| 19 |
+
hits = loaded.search("dogs", k=1)
|
| 20 |
+
assert hits and hits[0].doc_id == "id1"
|
| 21 |
+
|
| 22 |
+
def __meta(i: str):
|
| 23 |
+
from memory.rag.data.indexer import DocMeta
|
| 24 |
+
return DocMeta(doc_id=i, source="inline", title=i)
|
tests/test_logged_in_bot.py
CHANGED
|
@@ -1,2 +1,84 @@
|
|
| 1 |
# /test/test_logged_in_bot.py
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /test/test_logged_in_bot.py
|
| 2 |
+
"""
|
| 3 |
+
Tests for logged_in_bot.tools (no Azure required).
|
| 4 |
+
Run: pytest -q
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
from logged_in_bot import tools as L
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def test_help_route_and_reply():
|
| 14 |
+
resp = L.handle_logged_in_turn("help", history=[], user=None)
|
| 15 |
+
assert isinstance(resp, dict)
|
| 16 |
+
assert "I can:" in resp["reply"]
|
| 17 |
+
assert resp["meta"]["intent"] == "help"
|
| 18 |
+
assert "sentiment" in resp["meta"] # attached even in help path
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_echo_payload():
|
| 22 |
+
resp = L.handle_logged_in_turn("echo hello world", history=[], user=None)
|
| 23 |
+
assert resp["reply"] == "hello world"
|
| 24 |
+
assert resp["meta"]["intent"] == "echo"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_summarize_uses_first_sentence():
|
| 28 |
+
text = "This is the first sentence. This is the second sentence."
|
| 29 |
+
resp = L.handle_logged_in_turn(f"summarize {text}", history=[], user=None)
|
| 30 |
+
# naive summarizer returns the first sentence (possibly truncated)
|
| 31 |
+
assert "first sentence" in resp["reply"]
|
| 32 |
+
assert resp["meta"]["intent"] == "summarize"
|
| 33 |
+
assert "sentiment" in resp["meta"] # sentiment computed on source text
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def test_empty_input_prompts_user():
|
| 37 |
+
resp = L.handle_logged_in_turn("", history=[], user=None)
|
| 38 |
+
assert "Please type" in resp["reply"]
|
| 39 |
+
assert resp["meta"]["intent"] == "empty"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_general_chat_fallback_and_sentiment():
|
| 43 |
+
resp = L.handle_logged_in_turn("I love this project!", history=[], user=None)
|
| 44 |
+
assert isinstance(resp["reply"], str) and len(resp["reply"]) > 0
|
| 45 |
+
# sentiment present; backend may be "local" or "none" depending on env
|
| 46 |
+
sent = resp["meta"].get("sentiment", {})
|
| 47 |
+
assert sent.get("label") in {"positive", "neutral", "negative", None}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_optional_redaction_is_honored(monkeypatch):
|
| 51 |
+
# Monkeypatch optional redactor to simulate PII masking
|
| 52 |
+
monkeypatch.setattr(L, "pii_redact", lambda s: s.replace("555-1234", "[REDACTED]"), raising=False)
|
| 53 |
+
resp = L.handle_logged_in_turn("echo call me at 555-1234", history=[], user=None)
|
| 54 |
+
assert resp["meta"]["redacted"] is True
|
| 55 |
+
assert resp["reply"] == "call me at [REDACTED]"
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_input_length_cap(monkeypatch):
|
| 59 |
+
# Cap input length to 10 chars; ensure ellipsis added
|
| 60 |
+
monkeypatch.setenv("MAX_INPUT_CHARS", "10")
|
| 61 |
+
long = "echo 1234567890ABCDEFGHIJ"
|
| 62 |
+
resp = L.handle_logged_in_turn(long, history=[], user=None)
|
| 63 |
+
# reply is payload of redacted/sanitized text; should end with ellipsis
|
| 64 |
+
assert resp["reply"].endswith("β¦") or resp["reply"].endswith("...") # handle different ellipsis if changed
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def test_history_pass_through_shape():
|
| 68 |
+
# History should be accepted and not crash; we don't deeply inspect here
|
| 69 |
+
hist = [("user", "prev"), ("bot", "ok")]
|
| 70 |
+
resp = L.handle_logged_in_turn("echo ping", history=hist, user={"id": "u1"})
|
| 71 |
+
assert resp["reply"] == "ping"
|
| 72 |
+
assert isinstance(resp["meta"], dict)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@pytest.mark.parametrize("msg,expected_intent", [
|
| 76 |
+
("help", "help"),
|
| 77 |
+
("echo abc", "echo"),
|
| 78 |
+
("summarize One. Two.", "summarize"),
|
| 79 |
+
("random chat", "chat"),
|
| 80 |
+
])
|
| 81 |
+
def test_intent_detection_smoke(msg, expected_intent):
|
| 82 |
+
r = L.handle_logged_in_turn(msg, history=[], user=None)
|
| 83 |
+
assert r["meta"]["intent"] == expected_intent
|
| 84 |
+
|
tests/test_memory.py
CHANGED
|
@@ -1,2 +1,95 @@
|
|
| 1 |
# /test/test_memory.py
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /test/test_memory.py
|
| 2 |
+
"""
|
| 3 |
+
Tests for memory.sessions
|
| 4 |
+
Run: pytest -q
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import time
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from memory import sessions as S
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def test_create_and_append_history():
    """A fresh session records turns in order and keeps updated_at monotonic."""
    store = S.SessionStore(ttl_seconds=None, max_history=10)
    session = store.create(user_id="u1")
    assert session.session_id
    sid = session.session_id

    store.append_user(sid, "hello")
    store.append_bot(sid, "hi there")
    assert store.get_history(sid) == [("user", "hello"), ("bot", "hi there")]

    # Appending again must not move the timestamp backwards.
    stamp_before = session.updated_at
    store.append_user(sid, "next")
    assert store.get(sid).updated_at >= stamp_before
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_max_history_cap():
    """Only the most recent max_history turns survive in the session."""
    store = S.SessionStore(ttl_seconds=None, max_history=3)
    sid = store.create().session_id

    # Four appends against a cap of three: the oldest turn falls off the front.
    for appender, text in (
        (store.append_user, "a"),
        (store.append_bot, "b"),
        (store.append_user, "c"),
        (store.append_bot, "d"),
    ):
        appender(sid, text)

    assert store.get_history(sid) == [("bot", "b"), ("user", "c"), ("bot", "d")]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_ttl_sweep_expires_old_sessions():
    """sweep() drops sessions whose updated_at is older than the TTL."""
    store = S.SessionStore(ttl_seconds=0)  # zero TTL: everything is stale
    first = store.create()
    second = store.create()

    # Push both timestamps into the past so the sweep sees them as expired.
    for sess in (first, second):
        store._sessions[sess.session_id].updated_at -= 10

    assert store.sweep() >= 1
    # Whatever survives the sweep must still count as fresh.
    for sid in store.all_ids():
        assert not store._expired(store.get(sid))
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def test_key_value_store_helpers():
    """set/get_value/data_dict round-trip arbitrary per-session values."""
    store = S.SessionStore(ttl_seconds=None)
    sid = store.create().session_id

    store.set(sid, "mode", "anonymous")
    store.set(sid, "counter", 1)

    assert store.get_value(sid, "mode") == "anonymous"
    assert store.data_dict(sid)["counter"] == 1
    # A missing key falls back to the supplied default.
    assert store.get_value(sid, "missing", default="x") == "x"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def test_persistence_save_and_load(tmp_path: Path):
    """A saved store reloads with history and user_id intact."""
    target = tmp_path / "sess.json"

    original = S.SessionStore(ttl_seconds=None)
    sess = original.create(user_id="uX")
    original.append_user(sess.session_id, "hello")
    original.append_bot(sess.session_id, "hi")
    original.save(target)

    restored = S.SessionStore.load(target)
    assert restored.get_history(sess.session_id) == [("user", "hello"), ("bot", "hi")]
    assert restored.get(sess.session_id).user_id == "uX"
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_module_level_singleton_and_helpers():
    """Module-level convenience wrappers operate on the shared default store."""
    sess = S.new_session(user_id="alice")
    sid = sess.session_id

    S.append_user(sid, "hey")
    S.append_bot(sid, "hello!")
    assert S.history(sid)[-2:] == [("user", "hey"), ("bot", "hello!")]

    S.set_value(sid, "flag", True)
    assert S.get_value(sid, "flag") is True
|
tests/test_nlu.py
CHANGED
|
@@ -1,2 +1,46 @@
|
|
| 1 |
# /test/test_nlu.py
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# /test/test_nlu.py
|
| 2 |
+
"""
|
| 3 |
+
Basic tests for the NLU pipeline and router.
|
| 4 |
+
Run with: pytest -q
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from nlu import pipeline, router
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_pipeline_greeting():
    """'Hello there' must be classified as a confident greeting."""
    result = pipeline.analyze("Hello there")
    assert result["intent"] == "greeting"
    assert result["confidence"] > 0.5
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_pipeline_general():
    """Unrecognised text falls back to 'general' and still carries entities."""
    result = pipeline.analyze("completely random utterance")
    assert result["intent"] == "general"
    assert "entities" in result
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_router_route_and_respond():
    """A help query routes to the HELP action and yields a help-themed reply."""
    routed = router.route("Can you help me?")
    assert routed["intent"] == "help"
    assert routed["action"] == "HELP"

    reply = router.respond("Can you help me?")
    assert isinstance(reply, str)
    assert "help" in reply.lower()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_router_sentiment_positive():
    """Positive phrasing routes to sentiment_positive with an upbeat reply."""
    assert router.route("I love this bot!")["intent"] == "sentiment_positive"
    reply = router.respond("I love this bot!").lower()
    assert "glad" in reply or "hear" in reply
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_router_goodbye():
    """'bye' maps to the GOODBYE action and the reply says goodbye."""
    assert router.route("bye")["action"] == "GOODBYE"
    assert "goodbye" in router.respond("bye").lower()
|
tests/test_retriever.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tests/test_retriever.py
from pathlib import Path

# NOTE: this commit adds memory/rag/indexer.py and memory/rag/retriever.py;
# the previous "memory.rag.data.*" paths pointed at a package that does not
# exist in the tree and would raise ImportError at collection time.
from memory.rag.indexer import DocMeta, TfidfIndex
from memory.rag.retriever import Filters, retrieve
|
| 5 |
+
|
| 6 |
+
def _add(idx, did, text, title=None, tags=None):
    """Index *text* under document id *did* with inline-source metadata."""
    idx.add_text(did, text, DocMeta(doc_id=did, source="inline", title=title, tags=tags))
|
| 9 |
+
|
| 10 |
+
def test_retrieve_passage(tmp_path: Path):
    """retrieve() against a freshly saved two-doc index surfaces the relevant doc.

    Uses the module-level TfidfIndex import; the previous in-test re-imports
    from the non-existent "memory.rag.data" package (including the unused
    DEFAULT_INDEX_PATH) were removed, as was the unused monkeypatch fixture.
    """
    index_path = tmp_path / "idx.json"
    idx = TfidfIndex()
    _add(idx, "d1", "Rules for an anonymous chatbot are simple and fast.", title="Design", tags=["doc", "slide"])
    _add(idx, "d2", "This document explains retrieval and index search.", title="RAG", tags=["doc"])
    idx.save(index_path)

    # Query the saved index and confirm the anonymous-chatbot document ranks.
    res = retrieve("anonymous chatbot rules", k=2, index_path=index_path)
    assert res and any("anonymous" in r.text.lower() for r in res)
|
| 23 |
+
|
| 24 |
+
def test_filters(tmp_path: Path):
    """Title and tag filters narrow identical-text hits down to one document.

    TfidfIndex comes from the module-level import; the redundant local import
    from the non-existent "memory.rag.data" package was removed.
    """
    idx = TfidfIndex()
    _add(idx, "a", "hello world", title="Alpha", tags=["doc", "slide"])
    _add(idx, "b", "hello world", title="Beta", tags=["doc"])
    index_path = tmp_path / "idx.json"
    idx.save(index_path)

    filters = Filters(title_contains="alpha", require_tags=["doc", "slide"])
    res = retrieve("hello", k=5, index_path=index_path, filters=filters)
    assert len(res) == 1 and res[0].title == "Alpha"
|
tests/test_sessions.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tests/test_sessions.py
|
| 2 |
+
from memory.sessions import SessionStore
|
| 3 |
+
|
| 4 |
+
def test_create_and_history():
    """History is capped at max_history, keeping only the newest turns."""
    store = SessionStore(ttl_seconds=None, max_history=3)
    sid = store.create(user_id="u1").session_id

    store.append_user(sid, "a")
    store.append_bot(sid, "b")
    store.append_user(sid, "c")
    # Fourth append pushes "a" out of the 3-slot window.
    store.append_bot(sid, "d")

    assert store.get_history(sid) == [("bot", "b"), ("user", "c"), ("bot", "d")]
|
| 13 |
+
|
| 14 |
+
def test_save_load(tmp_path):
    """Saving to disk and loading back preserves recorded history."""
    store = SessionStore(ttl_seconds=None)
    sid = store.create().session_id
    store.append_user(sid, "hello")

    target = tmp_path / "sess.json"
    store.save(target)

    reloaded = SessionStore.load(target)
    assert reloaded.get_history(sid)[0] == ("user", "hello")
|
tree.txt
CHANGED
|
@@ -11,9 +11,6 @@ C:\Users\User\Agentic-Chat-bot-
|
|
| 11 |
β βββ handler.py
|
| 12 |
β βββ rules.py
|
| 13 |
βββ app
|
| 14 |
-
β βββ app
|
| 15 |
-
β β βββ app.py
|
| 16 |
-
β β βββ routes.py
|
| 17 |
β βββ assets
|
| 18 |
β β βββ html
|
| 19 |
β β βββ agenticcore_frontend.html
|
|
@@ -36,7 +33,6 @@ C:\Users\User\Agentic-Chat-bot-
|
|
| 36 |
β βββ architecture.md
|
| 37 |
β βββ design.md
|
| 38 |
β βββ DEV_DOC.md
|
| 39 |
-
β βββ flowchart.png
|
| 40 |
β βββ results.md
|
| 41 |
βββ examples
|
| 42 |
β βββ example.py
|
|
@@ -84,10 +80,13 @@ C:\Users\User\Agentic-Chat-bot-
|
|
| 84 |
β βββ smoke_test.py
|
| 85 |
β βββ test_anon_bot.py
|
| 86 |
β βββ test_guardrails.py
|
|
|
|
| 87 |
β βββ test_logged_in_bot.py
|
| 88 |
β βββ test_memory.py
|
| 89 |
β βββ test_nlu.py
|
| 90 |
-
β
|
|
|
|
|
|
|
| 91 |
βββ tools
|
| 92 |
β βββ quick_sanity.py
|
| 93 |
βββ .gitignore
|
|
|
|
| 11 |
β βββ handler.py
|
| 12 |
β βββ rules.py
|
| 13 |
βββ app
|
|
|
|
|
|
|
|
|
|
| 14 |
β βββ assets
|
| 15 |
β β βββ html
|
| 16 |
β β βββ agenticcore_frontend.html
|
|
|
|
| 33 |
β βββ architecture.md
|
| 34 |
β βββ design.md
|
| 35 |
β βββ DEV_DOC.md
|
|
|
|
| 36 |
β βββ results.md
|
| 37 |
βββ examples
|
| 38 |
β βββ example.py
|
|
|
|
| 80 |
β βββ smoke_test.py
|
| 81 |
β βββ test_anon_bot.py
|
| 82 |
β βββ test_guardrails.py
|
| 83 |
+
β βββ test_indexer.py
|
| 84 |
β βββ test_logged_in_bot.py
|
| 85 |
β βββ test_memory.py
|
| 86 |
β βββ test_nlu.py
|
| 87 |
+
β βββ test_retriever.py
|
| 88 |
+
β βββ test_routes.py
|
| 89 |
+
β βββ test_sessions.py
|
| 90 |
βββ tools
|
| 91 |
β βββ quick_sanity.py
|
| 92 |
βββ .gitignore
|